diff options
Diffstat (limited to 'third_party/rust/rust_cascade')
15 files changed, 1656 insertions, 0 deletions
diff --git a/third_party/rust/rust_cascade/.cargo-checksum.json b/third_party/rust/rust_cascade/.cargo-checksum.json new file mode 100644 index 0000000000..963fd724ce --- /dev/null +++ b/third_party/rust/rust_cascade/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.toml":"c523c5156c5d1fe20facdf880a0bab82235bb36a0e60e89c04cfa8fcd4c8ed90","README.md":"a4396d1adf63a77ae9aa0d1d850d02d09eec4a92810a52d675163688f312b3e8","license.txt":"1f256ecad192880510e84ad60474eab7589218784b9a50bc7ceee34c2b91f1d5","src/lib.rs":"30f2fc8a98641d6382e0bf9990b62959d445eeb7f2418f04352109a85eaee555","test_data/make-sample-data.py":"7b9f3efda7d1043eaa32619d9bdc4904db5563b0132815a035211037ab1e028a","test_data/requirements.txt":"cb9372b33ed2774e0d5040459fd63a2f9abae2be599869be43a2a077b2c08aa3","test_data/test_v1_murmur_mlbf":"243df0b7f2f55bfe3cefbba2d4be5eb7957c0a063559c9f284ca4c1ee4211eb5","test_data/test_v1_murmur_short_mlbf":"3d4f03dc0a65cf5800efed6ac0b3c73e5b61e5d62bc82ac42744abc67f4c30fa","test_data/test_v2_murmur_inverted_mlbf":"8f72bc1ca79194026fb2f7335a21f8c61636278a91291122148ad8f1aa8917a2","test_data/test_v2_murmur_mlbf":"83dce93d1147b38ca94548ff52552688caf2ece8388b2b5ea3fff1cb67e1396e","test_data/test_v2_sha256ctr_salt_mlbf":"31970e184563c31f39cd52410c6de1e49924d175df472af441a5aacb016240c3","test_data/test_v2_sha256l32_inverted_mlbf":"96399e30463a761a3f5ae0d0e1d57738b05f231cf6d5c2fe1db6bd9d33fab992","test_data/test_v2_sha256l32_mlbf":"0bd6630ef9a900861af419b496657648ab84b8c134a6fc2484d8e1cf11820ec1","test_data/test_v2_sha256l32_salt_mlbf":"bed34a636baca454f37f66d8314d7891722c8a8b029a0e4e0cf1f6ba176ee2c8"},"package":"ef248456c30c6607f1eb1e5d11025367b3340e235314dd33d2b31b41b35ac335"}
\ No newline at end of file diff --git a/third_party/rust/rust_cascade/Cargo.toml b/third_party/rust/rust_cascade/Cargo.toml new file mode 100644 index 0000000000..a33da9b4c1 --- /dev/null +++ b/third_party/rust/rust_cascade/Cargo.toml @@ -0,0 +1,34 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +name = "rust_cascade" +version = "1.4.0" +authors = ["Mark Goodwin <mgoodwin@mozilla.com>", "Dana Keeler <dkeeler@mozilla.com>", "J.C. Jones <jc@mozilla.com>", "John Schanck <jschanck@mozilla.com>"] +description = "A simple bloom filter cascade implementation in Rust." +homepage = "https://github.com/mozilla/rust-cascade" +documentation = "https://docs.rs/rust_cascade/" +license = "MPL-2.0" +repository = "https://github.com/mozilla/rust-cascade" +[dependencies.byteorder] +version = "1.3.1" + +[dependencies.murmurhash3] +version = "0.0.5" + +[dependencies.rand] +version = "0.7.3" + +[dependencies.sha2] +version = "0.10.2" + +[features] +builder = [] diff --git a/third_party/rust/rust_cascade/README.md b/third_party/rust/rust_cascade/README.md new file mode 100644 index 0000000000..206bff9267 --- /dev/null +++ b/third_party/rust/rust_cascade/README.md @@ -0,0 +1,12 @@ +# rust-cascade +A Bloom filter cascade implementation in rust. This can utilize one of two hash +functions: + +* MurmurHash32, or +* SHA256, with an optional salt + +This implementation is designed to match up with the Python [filter-cascade +project](https://pypi.org/project/filtercascade/) +[[github](https://github.com/mozilla/filter-cascade)] + +See tests in src/lib.rs to get an idea of usage. diff --git a/third_party/rust/rust_cascade/license.txt b/third_party/rust/rust_cascade/license.txt new file mode 100644 index 0000000000..a612ad9813 --- /dev/null +++ b/third_party/rust/rust_cascade/license.txt @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/third_party/rust/rust_cascade/src/lib.rs b/third_party/rust/rust_cascade/src/lib.rs new file mode 100644 index 0000000000..eef8e1f97d --- /dev/null +++ b/third_party/rust/rust_cascade/src/lib.rs @@ -0,0 +1,1129 @@ +//! # rust-cascade +//! +//! A library for creating and querying the cascading bloom filters described by +//! Larisch, Choffnes, Levin, Maggs, Mislove, and Wilson in +//! "CRLite: A Scalable System for Pushing All TLS Revocations to All Browsers" +//! <https://www.ieee-security.org/TC/SP2017/papers/567.pdf> + +extern crate byteorder; +extern crate murmurhash3; +extern crate rand; +extern crate sha2; + +use byteorder::{ByteOrder, LittleEndian, ReadBytesExt}; +use murmurhash3::murmurhash3_x86_32; +#[cfg(feature = "builder")] +use rand::rngs::OsRng; +#[cfg(feature = "builder")] +use rand::RngCore; +use sha2::{Digest, Sha256}; +use std::convert::{TryFrom, TryInto}; +use std::fmt; +use std::io::{ErrorKind, Read}; +use std::mem::size_of; + +#[derive(Debug)] +pub enum CascadeError { + LongSalt, + TooManyLayers, + Collision, + UnknownHashFunction, + CapacityViolation(&'static str), + Parse(&'static str), +} + +impl fmt::Display for CascadeError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + CascadeError::LongSalt => { + write!(f, "Cannot serialize a filter with a salt of length >= 256.") + } + CascadeError::TooManyLayers => { + write!(f, "Cannot serialize a filter with >= 255 layers.") + } + CascadeError::Collision => { + write!(f, "Collision between included and excluded sets.") + } + CascadeError::UnknownHashFunction => { + write!(f, "Unknown hash function.") + } + CascadeError::CapacityViolation(function) => { + write!(f, "Unexpected call to {}", function) + } + CascadeError::Parse(reason) => { + write!(f, "Cannot parse cascade: {}", reason) + } + } + } +} + +/// A Bloom filter representing a specific layer in a multi-layer cascading Bloom filter. +/// The same hash function is used for all layers, so it is not encoded here. +struct Bloom { + /// How many hash functions this filter uses + n_hash_funcs: u32, + /// The bit length of the filter + size: u32, + /// The data of the filter + data: Vec<u8>, +} + +#[repr(u8)] +#[derive(Copy, Clone, PartialEq)] +/// These enumerations need to match the python filter-cascade project: +/// <https://github.com/mozilla/filter-cascade/blob/v0.3.0/filtercascade/fileformats.py> +pub enum HashAlgorithm { + MurmurHash3 = 1, + Sha256l32 = 2, // low 32 bits of sha256 + Sha256 = 3, // all 256 bits of sha256 +} + +impl fmt::Display for HashAlgorithm { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", *self as u8) + } +} + +impl TryFrom<u8> for HashAlgorithm { + type Error = CascadeError; + fn try_from(value: u8) -> Result<HashAlgorithm, CascadeError> { + match value { + // Naturally, these need to match the enum declaration + 1 => Ok(Self::MurmurHash3), + 2 => Ok(Self::Sha256l32), + 3 => Ok(Self::Sha256), + _ => Err(CascadeError::UnknownHashFunction), + } + } +} + +/// A CascadeIndexGenerator provides one-time access to a table of pseudorandom functions H_ij +/// in which each function is of the form +/// H(s: &[u8], r: u32) -> usize +/// and for which 0 <= H(s,r) < r for all s, r. +/// The pseudorandom functions share a common key, represented as a octet string, and the table can +/// be constructed from this key alone. The functions are pseudorandom with respect to s, but not +/// r. For a uniformly random key/table, fixed r, and arbitrary strings m0 and m1, +/// H_ij(m0, r) is computationally indistinguishable from H_ij(m1,r) +/// for all i,j. +/// +/// A call to next_layer() increments i and resets j. +/// A call to next_index(s, r) increments j, and outputs some value H_ij(s) with 0 <= H_ij(s) < r. + +#[derive(Debug)] +enum CascadeIndexGenerator { + MurmurHash3 { + key: Vec<u8>, + counter: u32, + depth: u8, + }, + Sha256l32 { + key: Vec<u8>, + counter: u32, + depth: u8, + }, + Sha256Ctr { + key: Vec<u8>, + counter: u32, + state: [u8; 32], + state_available: u8, + }, +} + +impl PartialEq for CascadeIndexGenerator { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + ( + CascadeIndexGenerator::MurmurHash3 { key: ref a, .. }, + CascadeIndexGenerator::MurmurHash3 { key: ref b, .. }, + ) + | ( + CascadeIndexGenerator::Sha256l32 { key: ref a, .. }, + CascadeIndexGenerator::Sha256l32 { key: ref b, .. }, + ) + | ( + CascadeIndexGenerator::Sha256Ctr { key: ref a, .. }, + CascadeIndexGenerator::Sha256Ctr { key: ref b, .. }, + ) => a == b, + _ => false, + } + } +} + +impl CascadeIndexGenerator { + fn new(hash_alg: HashAlgorithm, key: Vec<u8>) -> Self { + match hash_alg { + HashAlgorithm::MurmurHash3 => Self::MurmurHash3 { + key, + counter: 0, + depth: 1, + }, + HashAlgorithm::Sha256l32 => Self::Sha256l32 { + key, + counter: 0, + depth: 1, + }, + HashAlgorithm::Sha256 => Self::Sha256Ctr { + key, + counter: 0, + state: [0; 32], + state_available: 0, + }, + } + } + + fn next_layer(&mut self) { + match self { + Self::MurmurHash3 { + ref mut counter, + ref mut depth, + .. + } + | Self::Sha256l32 { + ref mut counter, + ref mut depth, + .. + } => { + *counter = 0; + *depth += 1; + } + Self::Sha256Ctr { .. } => (), + } + } + + fn next_index(&mut self, salt: &[u8], range: u32) -> usize { + let index = match self { + Self::MurmurHash3 { + key, + ref mut counter, + depth, + } => { + let hash_seed = (*counter << 16) + *depth as u32; + *counter += 1; + murmurhash3_x86_32(key, hash_seed) + } + + Self::Sha256l32 { + key, + ref mut counter, + depth, + } => { + let mut hasher = Sha256::new(); + hasher.update(salt); + hasher.update(counter.to_le_bytes()); + hasher.update(depth.to_le_bytes()); + hasher.update(&key); + *counter += 1; + u32::from_le_bytes( + hasher.finalize()[0..4] + .try_into() + .expect("sha256 should have given enough bytes"), + ) + } + + Self::Sha256Ctr { + key, + ref mut counter, + ref mut state, + ref mut state_available, + } => { + // |bytes_needed| is the minimum number of bytes needed to represent a value in [0, range). + let bytes_needed = ((range.next_power_of_two().trailing_zeros() + 7) / 8) as usize; + let mut index_arr = [0u8; 4]; + for byte in index_arr.iter_mut().take(bytes_needed) { + if *state_available == 0 { + let mut hasher = Sha256::new(); + hasher.update(counter.to_le_bytes()); + hasher.update(salt); + hasher.update(&key); + hasher.finalize_into(state.into()); + *state_available = state.len() as u8; + *counter += 1; + } + *byte = state[state.len() - *state_available as usize]; + *state_available -= 1; + } + LittleEndian::read_u32(&index_arr) + } + }; + (index % range) as usize + } +} + +impl Bloom { + /// `new_crlite_bloom` creates an empty bloom filter for a layer of a cascade with the + /// parameters specified in [LCL+17, Section III.C]. + /// + /// # Arguments + /// * `include_capacity` - the number of elements that will be encoded at the new layer. + /// * `exclude_capacity` - the number of elements in the complement of the encoded set. + /// * `top_layer` - whether this is the top layer of the filter. + #[cfg(feature = "builder")] + pub fn new_crlite_bloom( + include_capacity: usize, + exclude_capacity: usize, + top_layer: bool, + ) -> Self { + assert!(include_capacity != 0 && exclude_capacity != 0); + + let r = include_capacity as f64; + let s = exclude_capacity as f64; + + // The desired false positive rate for the top layer is + // p = r/(sqrt(2)*s). + // With this setting, the number of false positives (which will need to be + // encoded at the second layer) is expected to be a factor of sqrt(2) + // smaller than the number of elements encoded at the top layer. + // + // At layer i > 1 we try to ensure that the number of elements to be + // encoded at layer i+1 is half the number of elements encoded at + // layer i. So we take p = 1/2. + let log2_fp_rate = match top_layer { + true => (r / s).log2() - 0.5f64, + false => -1f64, + }; + + // the number of hash functions (k) and the size of the bloom filter (m) are given in + // [LCL+17] as k = log2(1/p) and m = r log2(1/p) / ln(2). + // + // If this formula gives a value of m < 256, we take m=256 instead. This results in very + // slightly sub-optimal size, but gives us the added benefit of doing less hashing. + let n_hash_funcs = (-log2_fp_rate).round() as u32; + let size = match (r * (-log2_fp_rate) / (f64::ln(2f64))).round() as u32 { + size if size >= 256 => size, + _ => 256, + }; + + Bloom { + n_hash_funcs, + size, + data: vec![0u8; ((size + 7) / 8) as usize], + } + } + + /// `read` attempts to decode the Bloom filter represented by the bytes in the given reader. + /// + /// # Arguments + /// * `reader` - The encoded representation of this Bloom filter. May be empty. May include + /// additional data describing further Bloom filters. + /// The format of an encoded Bloom filter is: + /// [1 byte] - the hash algorithm to use in the filter + /// [4 little endian bytes] - the length in bits of the filter + /// [4 little endian bytes] - the number of hash functions to use in the filter + /// [1 byte] - which layer in the cascade this filter is + /// [variable length bytes] - the filter itself (must be of minimal length) + pub fn read<R: Read>( + reader: &mut R, + ) -> Result<Option<(Bloom, usize, HashAlgorithm)>, CascadeError> { + let hash_algorithm_val = match reader.read_u8() { + Ok(val) => val, + // If reader is at EOF, there is no bloom filter. + Err(e) if e.kind() == ErrorKind::UnexpectedEof => return Ok(None), + Err(_) => return Err(CascadeError::Parse("read error")), + }; + let hash_algorithm = HashAlgorithm::try_from(hash_algorithm_val)?; + + let size = reader + .read_u32::<byteorder::LittleEndian>() + .or(Err(CascadeError::Parse("truncated at layer size")))?; + let n_hash_funcs = reader + .read_u32::<byteorder::LittleEndian>() + .or(Err(CascadeError::Parse("truncated at layer hash count")))?; + let layer = reader + .read_u8() + .or(Err(CascadeError::Parse("truncated at layer number")))?; + + let byte_count = ((size + 7) / 8) as usize; + let mut data = vec![0; byte_count]; + reader + .read_exact(&mut data) + .or(Err(CascadeError::Parse("truncated at layer data")))?; + let bloom = Bloom { + n_hash_funcs, + size, + data, + }; + Ok(Some((bloom, layer as usize, hash_algorithm))) + } + + fn has(&self, generator: &mut CascadeIndexGenerator, salt: &[u8]) -> bool { + for _ in 0..self.n_hash_funcs { + let bit_index = generator.next_index(salt, self.size); + assert!(bit_index < self.size as usize); + let byte_index = bit_index / 8; + let mask = 1 << (bit_index % 8); + if self.data[byte_index] & mask == 0 { + return false; + } + } + true + } + + #[cfg(feature = "builder")] + fn insert(&mut self, generator: &mut CascadeIndexGenerator, salt: &[u8]) { + for _ in 0..self.n_hash_funcs { + let bit_index = generator.next_index(salt, self.size); + let byte_index = bit_index / 8; + let mask = 1 << (bit_index % 8); + self.data[byte_index] |= mask; + } + } + + pub fn approximate_size_of(&self) -> usize { + size_of::<Bloom>() + self.data.len() + } +} + +impl fmt::Display for Bloom { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "n_hash_funcs={} size={}", self.n_hash_funcs, self.size) + } +} + +/// A multi-layer cascading Bloom filter. +pub struct Cascade { + /// The Bloom filter for this layer in the cascade + filters: Vec<Bloom>, + /// The salt in use, if any + salt: Vec<u8>, + /// The hash algorithm / index generating function to use + hash_algorithm: HashAlgorithm, + /// Whether the logic should be inverted + inverted: bool, +} + +impl Cascade { + /// from_bytes attempts to decode and return a multi-layer cascading Bloom filter. + /// + /// # Arguments + /// `bytes` - The encoded representation of the Bloom filters in this cascade. Starts with 2 + /// little endian bytes indicating the version. The current version is 2. The Python + /// filter-cascade project defines the formats, see + /// <https://github.com/mozilla/filter-cascade/blob/v0.3.0/filtercascade/fileformats.py> + /// + /// May be of length 0, in which case `None` is returned. + pub fn from_bytes(bytes: Vec<u8>) -> Result<Option<Self>, CascadeError> { + if bytes.is_empty() { + return Ok(None); + } + let mut reader = bytes.as_slice(); + let version = reader + .read_u16::<byteorder::LittleEndian>() + .or(Err(CascadeError::Parse("truncated at version")))?; + + let mut filters = vec![]; + let mut salt = vec![]; + let mut top_hash_alg = None; + let mut inverted = false; + + if version > 2 { + return Err(CascadeError::Parse("unknown version")); + } + + if version == 2 { + let inverted_val = reader + .read_u8() + .or(Err(CascadeError::Parse("truncated at inverted")))?; + if inverted_val > 1 { + return Err(CascadeError::Parse("invalid value for inverted")); + } + inverted = 0 != inverted_val; + let salt_len: usize = reader + .read_u8() + .or(Err(CascadeError::Parse("truncated at salt length")))? + .into(); + if salt_len >= 256 { + return Err(CascadeError::Parse("salt too long")); + } + if salt_len > 0 { + let mut salt_bytes = vec![0; salt_len]; + reader + .read_exact(&mut salt_bytes) + .or(Err(CascadeError::Parse("truncated at salt")))?; + salt = salt_bytes; + } + } + + while let Some((filter, layer_number, layer_hash_alg)) = Bloom::read(&mut reader)? { + filters.push(filter); + + if layer_number != filters.len() { + return Err(CascadeError::Parse("irregular layer numbering")); + } + + if *top_hash_alg.get_or_insert(layer_hash_alg) != layer_hash_alg { + return Err(CascadeError::Parse("Inconsistent hash algorithms")); + } + } + + if filters.is_empty() { + return Err(CascadeError::Parse("missing filters")); + } + + let hash_algorithm = top_hash_alg.ok_or(CascadeError::Parse("missing hash algorithm"))?; + + Ok(Some(Cascade { + filters, + salt, + hash_algorithm, + inverted, + })) + } + + /// to_bytes encodes a cascade in the version 2 format. + pub fn to_bytes(&self) -> Result<Vec<u8>, CascadeError> { + if self.salt.len() >= 256 { + return Err(CascadeError::LongSalt); + } + if self.filters.len() >= 255 { + return Err(CascadeError::TooManyLayers); + } + let mut out = vec![]; + let version: u16 = 2; + let inverted: u8 = self.inverted.into(); + let salt_len: u8 = self.salt.len() as u8; + let hash_alg: u8 = self.hash_algorithm as u8; + out.extend_from_slice(&version.to_le_bytes()); + out.push(inverted); + out.push(salt_len); + out.extend_from_slice(&self.salt); + for (layer, bloom) in self.filters.iter().enumerate() { + out.push(hash_alg); + out.extend_from_slice(&bloom.size.to_le_bytes()); + out.extend_from_slice(&bloom.n_hash_funcs.to_le_bytes()); + out.push((1 + layer) as u8); // 1-indexed + out.extend_from_slice(&bloom.data); + } + Ok(out) + } + + /// has determines if the given sequence of bytes is in the cascade. + /// + /// # Arguments + /// `entry` - The bytes to query + pub fn has(&self, entry: Vec<u8>) -> bool { + // Query filters 0..self.filters.len() until we get a non-membership result. + // If this occurs at an even index filter, the element *is not* included. + // ... at an odd-index filter, the element *is* included. + let mut generator = CascadeIndexGenerator::new(self.hash_algorithm, entry); + let mut rv = false; + for filter in &self.filters { + if filter.has(&mut generator, &self.salt) { + rv = !rv; + generator.next_layer(); + } else { + break; + } + } + if self.inverted { + rv = !rv; + } + rv + } + + pub fn invert(&mut self) { + self.inverted = !self.inverted; + } + + /// Determine the approximate amount of memory in bytes used by this + /// Cascade. Because this implementation does not integrate with the + /// allocator, it can't get an accurate measurement of how much memory it + /// uses. However, it can make a reasonable guess, assuming the sizes of + /// the bloom filters are large enough to dominate the overall allocated + /// size. + pub fn approximate_size_of(&self) -> usize { + size_of::<Cascade>() + + self + .filters + .iter() + .map(|x| x.approximate_size_of()) + .sum::<usize>() + + self.salt.len() + } +} + +impl fmt::Display for Cascade { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + writeln!( + f, + "salt={:?} inverted={} hash_algorithm={}", + self.salt, self.inverted, self.hash_algorithm, + )?; + for filter in &self.filters { + writeln!(f, "\t[{}]", filter)?; + } + Ok(()) + } +} + +/// A CascadeBuilder creates a Cascade with layers given by `Bloom::new_crlite_bloom`. +/// +/// A builder is initialized using [`CascadeBuilder::default`] or [`CascadeBuilder::new`]. Prefer `default`. The `new` constructor +/// allows the user to specify sensitive internal details such as the hash function and the domain +/// separation parameter. +/// +/// Both constructors take `include_capacity` and an `exclude_capacity` parameters. The +/// `include_capacity` is the number of elements that will be encoded in the Cascade. The +/// `exclude_capacity` is size of the complement of the encoded set. +/// +/// The encoded set is specified through calls to [`CascadeBuilder::include`]. Its complement is specified through +/// calls to [`CascadeBuilder::exclude`]. The cascade is built with a call to [`CascadeBuilder::finalize`]. +/// +/// The builder will track of the number of calls to `include` and `exclude`. +/// The caller is responsible for making *exactly* `include_capacity` calls to `include` +/// followed by *exactly* `exclude_capacity` calls to `exclude`. +/// Calling `exclude` before all `include` calls have been made will result in a panic!(). +/// Calling `finalize` before all `exclude` calls have been made will result in a panic!(). +/// +#[cfg(feature = "builder")] +pub struct CascadeBuilder { + filters: Vec<Bloom>, + salt: Vec<u8>, + hash_algorithm: HashAlgorithm, + to_include: Vec<CascadeIndexGenerator>, + to_exclude: Vec<CascadeIndexGenerator>, + status: BuildStatus, +} + +#[cfg(feature = "builder")] +impl CascadeBuilder { + pub fn default(include_capacity: usize, exclude_capacity: usize) -> Self { + let mut salt = vec![0u8; 16]; + OsRng.fill_bytes(&mut salt); + CascadeBuilder::new( + HashAlgorithm::Sha256, + salt, + include_capacity, + exclude_capacity, + ) + } + + pub fn new( + hash_algorithm: HashAlgorithm, + salt: Vec<u8>, + include_capacity: usize, + exclude_capacity: usize, + ) -> Self { + CascadeBuilder { + filters: vec![Bloom::new_crlite_bloom( + include_capacity, + exclude_capacity, + true, + )], + salt, + to_include: vec![], + to_exclude: vec![], + hash_algorithm, + status: BuildStatus(include_capacity, exclude_capacity), + } + } + + pub fn include(&mut self, item: Vec<u8>) -> Result<(), CascadeError> { + match self.status { + BuildStatus(ref mut cap, _) if *cap > 0 => *cap -= 1, + _ => return Err(CascadeError::CapacityViolation("include")), + } + let mut generator = CascadeIndexGenerator::new(self.hash_algorithm, item); + self.filters[0].insert(&mut generator, &self.salt); + self.to_include.push(generator); + + Ok(()) + } + + pub fn exclude(&mut self, item: Vec<u8>) -> Result<(), CascadeError> { + match self.status { + BuildStatus(0, ref mut cap) if *cap > 0 => *cap -= 1, + _ => return Err(CascadeError::CapacityViolation("exclude")), + } + let mut generator = CascadeIndexGenerator::new(self.hash_algorithm, item); + if self.filters[0].has(&mut generator, &self.salt) { + self.to_exclude.push(generator); + } + Ok(()) + } + + /// `exclude_threaded` is like `exclude` but it stores false positives in a caller-owned + /// `ExcludeSet`. This allows the caller to exclude items in parallel. + pub fn exclude_threaded(&self, exclude_set: &mut ExcludeSet, item: Vec<u8>) { + exclude_set.size += 1; + let mut generator = CascadeIndexGenerator::new(self.hash_algorithm, item); + if self.filters[0].has(&mut generator, &self.salt) { + exclude_set.set.push(generator); + } + } + + /// `collect_exclude_set` merges an `ExcludeSet` into the internal storage of the CascadeBuilder. + pub fn collect_exclude_set( + &mut self, + exclude_set: &mut ExcludeSet, + ) -> Result<(), CascadeError> { + match self.status { + BuildStatus(0, ref mut cap) if *cap >= exclude_set.size => *cap -= exclude_set.size, + _ => return Err(CascadeError::CapacityViolation("exclude")), + } + self.to_exclude.append(&mut exclude_set.set); + + Ok(()) + } + + fn push_layer(&mut self) -> Result<(), CascadeError> { + // At even layers we encode elements of to_include. At odd layers we encode elements of + // to_exclude. In both cases, we track false positives by filtering the complement of the + // encoded set through the newly produced bloom filter. + let at_even_layer = self.filters.len() % 2 == 0; + let (to_encode, to_filter) = match at_even_layer { + true => (&mut self.to_include, &mut self.to_exclude), + false => (&mut self.to_exclude, &mut self.to_include), + }; + + // split ownership of `salt` away from `to_encode` and `to_filter` + // We need an immutable reference to salt during `to_encode.iter_mut()` + let mut bloom = Bloom::new_crlite_bloom(to_encode.len(), to_filter.len(), false); + + let salt = self.salt.as_slice(); + + to_encode.iter_mut().for_each(|x| { + x.next_layer(); + bloom.insert(x, salt) + }); + + let mut delta = to_filter.len(); + to_filter.retain_mut(|x| { + x.next_layer(); + bloom.has(x, salt) + }); + delta -= to_filter.len(); + + if delta == 0 { + // Check for collisions between the |to_encode| and |to_filter| sets. + // The implementation of PartialEq for CascadeIndexGenerator will successfully + // identify cases where the user called |include(item)| and |exclude(item)| for the + // same item. It will not identify collisions in the underlying hash function. + for x in to_encode.iter_mut() { + if to_filter.contains(x) { + return Err(CascadeError::Collision); + } + } + } + + self.filters.push(bloom); + Ok(()) + } + + pub fn finalize(mut self) -> Result<Box<Cascade>, CascadeError> { + match self.status { + BuildStatus(0, 0) => (), + _ => return Err(CascadeError::CapacityViolation("finalize")), + } + + loop { + if self.to_exclude.is_empty() { + break; + } + self.push_layer()?; + + if self.to_include.is_empty() { + break; + } + self.push_layer()?; + } + + Ok(Box::new(Cascade { + filters: self.filters, + salt: self.salt, + hash_algorithm: self.hash_algorithm, + inverted: false, + })) + } +} + +/// BuildStatus is used to ensure that the `include`, `exclude`, and `finalize` calls to +/// CascadeBuilder are made in the right order. The (a,b) state indicates that the +/// CascadeBuilder is waiting for `a` calls to `include` and `b` calls to `exclude`. +#[cfg(feature = "builder")] +struct BuildStatus(usize, usize); + +/// CascadeBuilder::exclude takes `&mut self` so that it can count exclusions and push items to +/// self.to_exclude. The bulk of the work it does, however, can be done with an immutable reference +/// to the top level bloom filter. An `ExcludeSet` is used by `CascadeBuilder::exclude_threaded` to +/// track the changes to a `CascadeBuilder` that would be made with a call to +/// `CascadeBuilder::exclude`. +#[cfg(feature = "builder")] +#[derive(Default)] +pub struct ExcludeSet { + size: usize, + set: Vec<CascadeIndexGenerator>, +} + +#[cfg(test)] +mod tests { + use Bloom; + use Cascade; + #[cfg(feature = "builder")] + use CascadeBuilder; + #[cfg(feature = "builder")] + use CascadeError; + use CascadeIndexGenerator; + #[cfg(feature = "builder")] + use ExcludeSet; + use HashAlgorithm; + + #[test] + fn bloom_v1_test_from_bytes() { + let src: Vec<u8> = vec![ + 0x01, 0x09, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x41, 0x00, + ]; + let mut reader = src.as_slice(); + + match Bloom::read(&mut reader) { + Ok(Some((bloom, 1, HashAlgorithm::MurmurHash3))) => { + assert!(bloom.has( + &mut CascadeIndexGenerator::new(HashAlgorithm::MurmurHash3, b"this".to_vec()), + &vec![] + )); + assert!(bloom.has( + &mut CascadeIndexGenerator::new(HashAlgorithm::MurmurHash3, b"that".to_vec()), + &vec![] + )); + assert!(!bloom.has( + &mut CascadeIndexGenerator::new(HashAlgorithm::MurmurHash3, b"other".to_vec()), + &vec![] + )); + } + Ok(_) => panic!("Parsing failed"), + Err(_) => panic!("Parsing failed"), + }; + assert!(reader.is_empty()); + + let short: Vec<u8> = vec![ + 0x01, 0x09, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x41, + ]; + assert!(Bloom::read(&mut short.as_slice()).is_err()); + + let empty: Vec<u8> = Vec::new(); + let mut reader = empty.as_slice(); + match Bloom::read(&mut reader) { + Ok(should_be_none) => assert!(should_be_none.is_none()), + Err(_) => panic!("Parsing failed"), + }; + } + + #[test] + fn bloom_v3_unsupported() { + let src: Vec<u8> = vec![0x03, 0x01, 0x00]; + assert!(Bloom::read(&mut src.as_slice()).is_err()); + } + + #[test] + fn cascade_v1_murmur_from_file_bytes_test() { + let v = include_bytes!("../test_data/test_v1_murmur_mlbf").to_vec(); + let cascade = Cascade::from_bytes(v) + .expect("parsing Cascade should succeed") + .expect("Cascade should be Some"); + // Key format is SHA256(issuer SPKI) + serial number + let key_for_revoked_cert_1 = vec![ + 0x2e, 0xb2, 0xd5, 0xa8, 0x60, 0xfe, 0x50, 0xe9, 0xc2, 0x42, 0x36, 0x85, 0x52, 0x98, + 0x01, 0x50, 0xe4, 0x5d, 0xb5, 0x32, 0x1a, 0x5b, 0x00, 0x5e, 0x26, 0xd6, 0x76, 0x25, + 0x3a, 0x40, 0x9b, 0xf5, 0x06, 0x2d, 0xf5, 0x68, 0xa0, 0x51, 0x31, 0x08, 0x20, 0xd7, + 0xec, 0x43, 0x27, 0xe1, 0xba, 0xfd, + ]; + assert!(cascade.has(key_for_revoked_cert_1)); + let key_for_revoked_cert_2 = vec![ + 0xf1, 0x1c, 0x3d, 0xd0, 0x48, 0xf7, 0x4e, 0xdb, 0x7c, 0x45, 0x19, 0x2b, 0x83, 0xe5, + 0x98, 0x0d, 0x2f, 0x67, 0xec, 0x84, 0xb4, 0xdd, 0xb9, 0x39, 0x6e, 0x33, 0xff, 0x51, + 0x73, 0xed, 0x69, 0x8f, 0x00, 0xd2, 0xe8, 0xf6, 0xaa, 0x80, 0x48, 0x1c, 0xd4, + ]; + assert!(cascade.has(key_for_revoked_cert_2)); + let key_for_valid_cert = vec![ + 0x99, 0xfc, 0x9d, 0x40, 0xf1, 0xad, 0xb1, 0x63, 0x65, 0x61, 0xa6, 0x1d, 0x68, 0x3d, + 0x9e, 0xa6, 0xb4, 0x60, 0xc5, 0x7d, 0x0c, 0x75, 0xea, 0x00, 0xc3, 0x41, 0xb9, 0xdf, + 0xb9, 0x0b, 0x5f, 0x39, 0x0b, 0x77, 0x75, 0xf7, 0xaf, 0x9a, 0xe5, 0x42, 0x65, 0xc9, + 0xcd, 0x32, 0x57, 0x10, 0x77, 0x8e, + ]; + assert!(!cascade.has(key_for_valid_cert)); + + assert_eq!(cascade.approximate_size_of(), 15408); + + let v = include_bytes!("../test_data/test_v1_murmur_short_mlbf").to_vec(); + assert!(Cascade::from_bytes(v).is_err()); + } + + #[test] + fn cascade_v2_sha256l32_from_file_bytes_test() { + let v = include_bytes!("../test_data/test_v2_sha256l32_mlbf").to_vec(); + let cascade = Cascade::from_bytes(v) + .expect("parsing Cascade should succeed") + .expect("Cascade should be Some"); + + assert!(cascade.salt.len() == 0); + assert!(cascade.inverted == false); + assert!(cascade.has(b"this".to_vec()) == true); + assert!(cascade.has(b"that".to_vec()) == true); + assert!(cascade.has(b"other".to_vec()) == false); + assert_eq!(cascade.approximate_size_of(), 1001); + } + + #[test] + fn cascade_v2_sha256l32_with_salt_from_file_bytes_test() { + let v = include_bytes!("../test_data/test_v2_sha256l32_salt_mlbf").to_vec(); + let cascade = Cascade::from_bytes(v) + .expect("parsing Cascade should succeed") + .expect("Cascade should be Some"); + + assert!(cascade.salt == b"nacl".to_vec()); + assert!(cascade.inverted == false); + assert!(cascade.has(b"this".to_vec()) == true); + assert!(cascade.has(b"that".to_vec()) == true); + assert!(cascade.has(b"other".to_vec()) == false); + assert_eq!(cascade.approximate_size_of(), 1001); + } + + #[test] + fn cascade_v2_murmur_from_file_bytes_test() { + let v = include_bytes!("../test_data/test_v2_murmur_mlbf").to_vec(); + let cascade = Cascade::from_bytes(v) + .expect("parsing Cascade should succeed") + .expect("Cascade should be Some"); + + assert!(cascade.salt.len() == 0); + assert!(cascade.inverted == false); + assert!(cascade.has(b"this".to_vec()) == true); + assert!(cascade.has(b"that".to_vec()) == true); + assert!(cascade.has(b"other".to_vec()) == false); + assert_eq!(cascade.approximate_size_of(), 992); + } + + #[test] + fn cascade_v2_murmur_inverted_from_file_bytes_test() { + let v = include_bytes!("../test_data/test_v2_murmur_inverted_mlbf").to_vec(); + let cascade = Cascade::from_bytes(v) + .expect("parsing Cascade should succeed") + .expect("Cascade should be Some"); + + assert!(cascade.salt.len() == 0); + assert!(cascade.inverted == true); + assert!(cascade.has(b"this".to_vec()) == true); + assert!(cascade.has(b"that".to_vec()) == true); + assert!(cascade.has(b"other".to_vec()) == false); + assert_eq!(cascade.approximate_size_of(), 1058); + } + + #[test] + fn cascade_v2_sha256l32_inverted_from_file_bytes_test() { + let v = include_bytes!("../test_data/test_v2_sha256l32_inverted_mlbf").to_vec(); + let cascade = Cascade::from_bytes(v) + .expect("parsing Cascade should succeed") + .expect("Cascade should be Some"); + + assert!(cascade.salt.len() == 0); + assert!(cascade.inverted == true); + assert!(cascade.has(b"this".to_vec()) == true); + assert!(cascade.has(b"that".to_vec()) == true); + assert!(cascade.has(b"other".to_vec()) == false); + assert_eq!(cascade.approximate_size_of(), 1061); + } + + #[test] + fn cascade_v2_sha256ctr_from_file_bytes_test() { + let v = include_bytes!("../test_data/test_v2_sha256ctr_salt_mlbf").to_vec(); + let cascade = Cascade::from_bytes(v) + .expect("parsing Cascade should succeed") + .expect("Cascade should be Some"); + + assert!(cascade.salt == b"nacl".to_vec()); + assert!(cascade.inverted == false); + assert!(cascade.has(b"this".to_vec()) == true); + assert!(cascade.has(b"that".to_vec()) == true); + assert!(cascade.has(b"other".to_vec()) == false); + assert_eq!(cascade.approximate_size_of(), 1070); + } + + #[test] + fn cascade_empty() { + let cascade = Cascade::from_bytes(Vec::new()).expect("parsing Cascade should succeed"); + assert!(cascade.is_none()); + } + + #[test] + fn cascade_test_from_bytes() { + let unknown_version: Vec<u8> = vec![0xff, 0xff, 0x00, 0x00]; + match Cascade::from_bytes(unknown_version) { + Ok(_) => panic!("Cascade::from_bytes allows unknown version."), + Err(_) => (), + } + + let first_layer_is_zero: Vec<u8> = vec![ + 0x01, 0x00, 0x01, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + match Cascade::from_bytes(first_layer_is_zero) { + Ok(_) => panic!("Cascade::from_bytes allows zero indexed layers."), + Err(_) => (), + } + + let second_layer_is_three: Vec<u8> = vec![ + 0x01, 0x00, 0x01, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x01, + 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, + ]; + match Cascade::from_bytes(second_layer_is_three) { + Ok(_) => panic!("Cascade::from_bytes allows non-sequential layers."), + Err(_) => (), + } + } + + #[test] + #[cfg(feature = "builder")] + fn cascade_builder_test_collision() { + let mut builder = CascadeBuilder::default(1, 1); + builder.include(b"collision!".to_vec()).ok(); + builder.exclude(b"collision!".to_vec()).ok(); + assert!(matches!(builder.finalize(), Err(CascadeError::Collision))); + } + + #[test] + #[cfg(feature = "builder")] + fn cascade_builder_test_exclude_too_few() { + let mut builder = CascadeBuilder::default(1, 1); + builder.include(b"1".to_vec()).ok(); + assert!(matches!( + builder.finalize(), + Err(CascadeError::CapacityViolation(_)) + )); + } + + #[test] + #[cfg(feature = "builder")] + fn cascade_builder_test_include_too_few() { + let mut builder = CascadeBuilder::default(1, 1); + assert!(matches!( + builder.exclude(b"1".to_vec()), + Err(CascadeError::CapacityViolation(_)) + )); + } + + #[test] + #[cfg(feature = "builder")] + fn cascade_builder_test_include_too_many() { + let mut builder = CascadeBuilder::default(1, 1); + builder.include(b"1".to_vec()).ok(); + assert!(matches!( + builder.include(b"2".to_vec()), + Err(CascadeError::CapacityViolation(_)) + )); + } + + #[test] + #[cfg(feature = "builder")] + fn cascade_builder_test_exclude_too_many() { + let mut builder = CascadeBuilder::default(1, 1); + builder.include(b"1".to_vec()).ok(); + builder.exclude(b"2".to_vec()).ok(); + assert!(matches!( + builder.exclude(b"3".to_vec()), + Err(CascadeError::CapacityViolation(_)) + )); + } + + #[test] + #[cfg(feature = "builder")] + fn cascade_builder_test_exclude_threaded_no_collect() { + let mut builder = CascadeBuilder::default(1, 3); + let mut exclude_set = ExcludeSet::default(); + builder.include(b"1".to_vec()).ok(); + builder.exclude_threaded(&mut exclude_set, b"2".to_vec()); + builder.exclude_threaded(&mut exclude_set, b"3".to_vec()); + builder.exclude_threaded(&mut exclude_set, b"4".to_vec()); + assert!(matches!( + builder.finalize(), + Err(CascadeError::CapacityViolation(_)) + )); + } + + #[test] + #[cfg(feature = "builder")] + fn cascade_builder_test_exclude_threaded_too_many() { + let mut builder = CascadeBuilder::default(1, 3); + let mut exclude_set = ExcludeSet::default(); + builder.include(b"1".to_vec()).ok(); + builder.exclude_threaded(&mut exclude_set, b"2".to_vec()); + builder.exclude_threaded(&mut exclude_set, b"3".to_vec()); + builder.exclude_threaded(&mut exclude_set, b"4".to_vec()); + builder.exclude_threaded(&mut exclude_set, b"5".to_vec()); + assert!(matches!( + builder.collect_exclude_set(&mut exclude_set), + Err(CascadeError::CapacityViolation(_)) + )); + } + + #[test] + #[cfg(feature = "builder")] + fn cascade_builder_test_exclude_threaded() { + let mut builder = CascadeBuilder::default(1, 3); + let mut exclude_set = ExcludeSet::default(); + builder.include(b"1".to_vec()).ok(); + builder.exclude_threaded(&mut exclude_set, b"2".to_vec()); + builder.exclude_threaded(&mut exclude_set, b"3".to_vec()); + builder.exclude_threaded(&mut exclude_set, b"4".to_vec()); + builder.collect_exclude_set(&mut exclude_set).ok(); + builder.finalize().ok(); + } + + #[cfg(feature = "builder")] + fn cascade_builder_test_generate(hash_alg: HashAlgorithm, inverted: bool) { + let total = 10_000_usize; + let included = 100_usize; + + let salt = vec![0u8; 16]; + let mut builder = + CascadeBuilder::new(hash_alg, salt, included, (total - included) as usize); + for i in 0..included { + builder.include(i.to_le_bytes().to_vec()).ok(); + } + for i in included..total { + builder.exclude(i.to_le_bytes().to_vec()).ok(); + } + let mut cascade = builder.finalize().unwrap(); + + if inverted { + cascade.invert() + } + + // Ensure we can serialize / deserialize + let cascade_bytes = cascade.to_bytes().expect("failed to serialize cascade"); + + let cascade = Cascade::from_bytes(cascade_bytes) + .expect("failed to deserialize cascade") + .expect("cascade should not be None here"); + + // Ensure each query gives the correct result + for i in 0..included { + assert!(cascade.has(i.to_le_bytes().to_vec()) == true ^ inverted) + } + for i in included..total { + assert!(cascade.has(i.to_le_bytes().to_vec()) == false ^ inverted) + } + } + + #[test] + #[cfg(feature = "builder")] + fn cascade_builder_test_generate_murmurhash3_inverted() { + cascade_builder_test_generate(HashAlgorithm::MurmurHash3, true); + } + + #[test] + #[cfg(feature = "builder")] + fn cascade_builder_test_generate_murmurhash3() { + cascade_builder_test_generate(HashAlgorithm::MurmurHash3, false); + } + + #[test] + #[cfg(feature = "builder")] + fn cascade_builder_test_generate_sha256l32() { + cascade_builder_test_generate(HashAlgorithm::Sha256l32, false); + } + + #[test] + #[cfg(feature = "builder")] + fn cascade_builder_test_generate_sha256() { + cascade_builder_test_generate(HashAlgorithm::Sha256, false); + } +} diff --git a/third_party/rust/rust_cascade/test_data/make-sample-data.py b/third_party/rust/rust_cascade/test_data/make-sample-data.py new file mode 100644 index 0000000000..c9f117da5f --- /dev/null +++ b/third_party/rust/rust_cascade/test_data/make-sample-data.py @@ -0,0 +1,106 @@ +import filtercascade +import hashlib +from pathlib import Path + +import sys +import logging + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + + +def predictable_serial_gen(start, end): + counter = start + while counter < end: + counter += 1 + m = hashlib.sha256() + m.update(counter.to_bytes(4, byteorder="big")) + yield m.hexdigest() + + +def store(fc, path): + if path.exists(): + path.unlink() + with open(path, "wb") as f: + fc.tofile(f) + + +small_set = list(set(predictable_serial_gen(0, 500))) +large_set = set(predictable_serial_gen(500, 10_000)) + +# filter parameters +growth_factor = 1.0 +min_filter_length = 177 # 177 * 1.44 ~ 256, so smallest filter will have 256 bits + +print("--- v2_sha256l32_with_salt ---") +v2_sha256l32_with_salt = filtercascade.FilterCascade( + [], + defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256, + salt=b"nacl", + growth_factor=growth_factor, + min_filter_length=min_filter_length, +) +v2_sha256l32_with_salt.initialize( + include=[b"this", b"that"] + small_set, exclude=large_set | set([b"other"]) +) +store(v2_sha256l32_with_salt, Path("test_v2_sha256l32_salt_mlbf")) + +print("--- v2_sha256l32 ---") +v2_sha256l32 = filtercascade.FilterCascade( + [], + defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256, + growth_factor=growth_factor, + min_filter_length=min_filter_length, +) +v2_sha256l32.initialize( + include=[b"this", b"that"] + small_set, exclude=large_set | set([b"other"]) +) +store(v2_sha256l32, Path("test_v2_sha256l32_mlbf")) + +print("--- v2_murmur ---") +v2_murmur = filtercascade.FilterCascade( + [], + defaultHashAlg=filtercascade.fileformats.HashAlgorithm.MURMUR3, + growth_factor=growth_factor, + min_filter_length=min_filter_length, +) +v2_murmur.initialize( + include=[b"this", b"that"] + small_set, exclude=large_set | set([b"other"]) +) +store(v2_murmur, Path("test_v2_murmur_mlbf")) + +print("--- v2_murmur_inverted ---") +v2_murmur_inverted = filtercascade.FilterCascade( + [], + defaultHashAlg=filtercascade.fileformats.HashAlgorithm.MURMUR3, + growth_factor=growth_factor, + min_filter_length=min_filter_length, +) +v2_murmur_inverted.initialize( + include=large_set | set([b"this", b"that"]), exclude=[b"other"] + small_set +) +store(v2_murmur_inverted, Path("test_v2_murmur_inverted_mlbf")) + +print("--- v2_sha256l32_inverted ---") +v2_sha256l32_inverted = filtercascade.FilterCascade( + [], + defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256, + growth_factor=growth_factor, + min_filter_length=min_filter_length, +) +v2_sha256l32_inverted.initialize( + include=large_set | set([b"this", b"that"]), exclude=[b"other"] + small_set +) +store(v2_sha256l32_inverted, Path("test_v2_sha256l32_inverted_mlbf")) + +print("--- v2_sha256ctr_with_salt ---") +v2_sha256ctr_with_salt = filtercascade.FilterCascade( + [], + defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256CTR, + salt=b"nacl", + growth_factor=growth_factor, + min_filter_length=min_filter_length, +) +v2_sha256ctr_with_salt.initialize( + include=[b"this", b"that"] + small_set, exclude=large_set | set([b"other"]) +) +store(v2_sha256ctr_with_salt, Path("test_v2_sha256ctr_salt_mlbf")) diff --git a/third_party/rust/rust_cascade/test_data/requirements.txt b/third_party/rust/rust_cascade/test_data/requirements.txt new file mode 100644 index 0000000000..f97bd4328f --- /dev/null +++ b/third_party/rust/rust_cascade/test_data/requirements.txt @@ -0,0 +1 @@ +filtercascade >= 0.3.0 diff --git a/third_party/rust/rust_cascade/test_data/test_v1_murmur_mlbf b/third_party/rust/rust_cascade/test_data/test_v1_murmur_mlbf Binary files differnew file mode 100644 index 0000000000..34ced4b840 --- /dev/null +++ b/third_party/rust/rust_cascade/test_data/test_v1_murmur_mlbf diff --git a/third_party/rust/rust_cascade/test_data/test_v1_murmur_short_mlbf b/third_party/rust/rust_cascade/test_data/test_v1_murmur_short_mlbf Binary files differnew file mode 100644 index 0000000000..d0bb7071ab --- /dev/null +++ b/third_party/rust/rust_cascade/test_data/test_v1_murmur_short_mlbf diff --git a/third_party/rust/rust_cascade/test_data/test_v2_murmur_inverted_mlbf b/third_party/rust/rust_cascade/test_data/test_v2_murmur_inverted_mlbf Binary files differnew file mode 100644 index 0000000000..19acee150e --- /dev/null +++ b/third_party/rust/rust_cascade/test_data/test_v2_murmur_inverted_mlbf diff --git a/third_party/rust/rust_cascade/test_data/test_v2_murmur_mlbf b/third_party/rust/rust_cascade/test_data/test_v2_murmur_mlbf Binary files differnew file mode 100644 index 0000000000..78f527ebf3 --- /dev/null +++ b/third_party/rust/rust_cascade/test_data/test_v2_murmur_mlbf diff --git a/third_party/rust/rust_cascade/test_data/test_v2_sha256ctr_salt_mlbf b/third_party/rust/rust_cascade/test_data/test_v2_sha256ctr_salt_mlbf Binary files differnew file mode 100644 index 0000000000..fca48498d6 --- /dev/null +++ b/third_party/rust/rust_cascade/test_data/test_v2_sha256ctr_salt_mlbf diff --git a/third_party/rust/rust_cascade/test_data/test_v2_sha256l32_inverted_mlbf b/third_party/rust/rust_cascade/test_data/test_v2_sha256l32_inverted_mlbf Binary files differnew file mode 100644 index 0000000000..f9e892ac70 --- /dev/null +++ b/third_party/rust/rust_cascade/test_data/test_v2_sha256l32_inverted_mlbf diff --git a/third_party/rust/rust_cascade/test_data/test_v2_sha256l32_mlbf b/third_party/rust/rust_cascade/test_data/test_v2_sha256l32_mlbf Binary files differnew file mode 100644 index 0000000000..c0d7266fc4 --- /dev/null +++ b/third_party/rust/rust_cascade/test_data/test_v2_sha256l32_mlbf diff --git a/third_party/rust/rust_cascade/test_data/test_v2_sha256l32_salt_mlbf b/third_party/rust/rust_cascade/test_data/test_v2_sha256l32_salt_mlbf Binary files differnew file mode 100644 index 0000000000..4f9c2fff16 --- /dev/null +++ b/third_party/rust/rust_cascade/test_data/test_v2_sha256l32_salt_mlbf |