2 files changed, 1217 insertions, 0 deletions
diff --git a/third_party/rust/prio/benches/cycle_counts.rs b/third_party/rust/prio/benches/cycle_counts.rs
new file mode 100644
index 0000000000..43a4ccdad0
--- /dev/null
+++ b/third_party/rust/prio/benches/cycle_counts.rs
@@ -0,0 +1,341 @@
+#![cfg_attr(windows, allow(dead_code))]
+
+use cfg_if::cfg_if;
+use iai::black_box;
+#[cfg(feature = "experimental")]
+use prio::{
+    codec::{Decode, Encode, ParameterizedDecode},
+    field::{Field255, FieldElement},
+    idpf::{Idpf, IdpfInput, IdpfPublicShare, RingBufferCache},
+    vdaf::{poplar1::Poplar1IdpfValue, xof::Seed},
+};
+#[cfg(feature = "prio2")]
+use prio::{
+    field::FieldPrio2,
+    vdaf::{
+        prio2::{Prio2, Prio2PrepareShare},
+        Aggregator, Share,
+    },
+};
+use prio::{
+    field::{random_vector, Field128, Field64},
+    vdaf::{
+        prio3::{Prio3, Prio3InputShare},
+        Client,
+    },
+};
+
+/// Samples `size` random [`Field128`] elements via `random_vector`, panicking if that fails.
+fn prng(size: usize) -> Vec<Field128> {
+    random_vector::<Field128>(size).unwrap()
+}
+
+/// Cycle-count benchmark: samples 16 `Field128` elements via `prng`.
+fn prng_16() -> Vec<Field128> {
+    const LEN: usize = 16;
+    prng(LEN)
+}
+
+/// Cycle-count benchmark: samples 256 `Field128` elements via `prng`.
+fn prng_256() -> Vec<Field128> {
+    const LEN: usize = 256;
+    prng(LEN)
+}
+
+/// Cycle-count benchmark: samples 1024 `Field128` elements via `prng`.
+fn prng_1024() -> Vec<Field128> {
+    const LEN: usize = 1024;
+    prng(LEN)
+}
+
+/// Cycle-count benchmark: samples 4096 `Field128` elements via `prng`.
+fn prng_4096() -> Vec<Field128> {
+    const LEN: usize = 4096;
+    prng(LEN)
+}
+
+/// Shards an all-zero measurement of `size` entries with Prio2 (all-zero nonce) and returns
+/// the resulting input shares, discarding the public share.
+#[cfg(feature = "prio2")]
+fn prio2_client(size: usize) -> Vec<Share<FieldPrio2, 32>> {
+    let vdaf = Prio2::new(size).unwrap();
+    let measurement = vec![0u32; size];
+    let nonce = [0; 16];
+    // `black_box` keeps the optimizer from specializing on the constant inputs.
+    let (_public_share, input_shares) = vdaf
+        .shard(&black_box(measurement), &black_box(nonce))
+        .unwrap();
+    input_shares
+}
+
+/// Cycle-count benchmark: Prio2 sharding of a 10-entry measurement.
+#[cfg(feature = "prio2")]
+fn prio2_client_10() -> Vec<Share<FieldPrio2, 32>> {
+    const SIZE: usize = 10;
+    prio2_client(SIZE)
+}
+
+/// Cycle-count benchmark: Prio2 sharding of a 100-entry measurement.
+#[cfg(feature = "prio2")]
+fn prio2_client_100() -> Vec<Share<FieldPrio2, 32>> {
+    const SIZE: usize = 100;
+    prio2_client(SIZE)
+}
+
+/// Cycle-count benchmark: Prio2 sharding of a 1000-entry measurement.
+#[cfg(feature = "prio2")]
+fn prio2_client_1000() -> Vec<Share<FieldPrio2, 32>> {
+    const SIZE: usize = 1000;
+    prio2_client(SIZE)
+}
+
+/// Shards an all-zero measurement of `size` entries with Prio2, then calls `prepare_init` on
+/// the first input share with an all-zero 32-byte verify key, and returns the prepare share.
+#[cfg(feature = "prio2")]
+fn prio2_shard_and_prepare(size: usize) -> Prio2PrepareShare {
+    let vdaf = Prio2::new(size).unwrap();
+    let measurement = vec![0u32; size];
+    let nonce = [0; 16];
+    let verify_key = [0; 32];
+    let (public_share, input_shares) = vdaf
+        .shard(&black_box(measurement), &black_box(nonce))
+        .unwrap();
+    // Only the prepare share (second tuple element) is returned to the benchmark harness.
+    let (_prepare_state, prepare_share) = vdaf
+        .prepare_init(&verify_key, 0, &(), &nonce, &public_share, &input_shares[0])
+        .unwrap();
+    prepare_share
+}
+
+/// Cycle-count benchmark: Prio2 shard + `prepare_init` for a 10-entry measurement.
+#[cfg(feature = "prio2")]
+fn prio2_shard_and_prepare_10() -> Prio2PrepareShare {
+    const SIZE: usize = 10;
+    prio2_shard_and_prepare(SIZE)
+}
+
+/// Cycle-count benchmark: Prio2 shard + `prepare_init` for a 100-entry measurement.
+#[cfg(feature = "prio2")]
+fn prio2_shard_and_prepare_100() -> Prio2PrepareShare {
+    const SIZE: usize = 100;
+    prio2_shard_and_prepare(SIZE)
+}
+
+/// Cycle-count benchmark: Prio2 shard + `prepare_init` for a 1000-entry measurement.
+#[cfg(feature = "prio2")]
+fn prio2_shard_and_prepare_1000() -> Prio2PrepareShare {
+    const SIZE: usize = 1000;
+    prio2_shard_and_prepare(SIZE)
+}
+
+/// Cycle-count benchmark: shards the measurement `1` with `Prio3::new_count(2)` (all-zero
+/// nonce) and returns the input shares.
+fn prio3_client_count() -> Vec<Prio3InputShare<Field64, 16>> {
+    let vdaf = Prio3::new_count(2).unwrap();
+    let (measurement, nonce) = (1, [0; 16]);
+    let (_public_share, input_shares) = vdaf
+        .shard(&black_box(measurement), &black_box(nonce))
+        .unwrap();
+    input_shares
+}
+
+/// Cycle-count benchmark: shards the measurement `9` with `Prio3::new_histogram(2, 10, 3)`
+/// (all-zero nonce) and returns the input shares.
+fn prio3_client_histogram_10() -> Vec<Prio3InputShare<Field128, 16>> {
+    let vdaf = Prio3::new_histogram(2, 10, 3).unwrap();
+    let (measurement, nonce) = (9, [0; 16]);
+    let (_public_share, input_shares) = vdaf
+        .shard(&black_box(measurement), &black_box(nonce))
+        .unwrap();
+    input_shares
+}
+
+/// Cycle-count benchmark: shards the measurement `1337` with a Prio3 sum instance (two
+/// shares, all-zero nonce) and returns the input shares.
+///
+/// The instance is configured with 32 bits so that the measured work matches the `_32`
+/// suffix of this benchmark's name; it was previously built with 16 bits, which made the
+/// reported figure describe a different configuration than the one advertised.
+fn prio3_client_sum_32() -> Vec<Prio3InputShare<Field128, 16>> {
+    let prio3 = Prio3::new_sum(2, 32).unwrap();
+    let measurement = 1337;
+    let nonce = [0; 16];
+    prio3
+        .shard(&black_box(measurement), &black_box(nonce))
+        .unwrap()
+        .1
+}
+
+/// Cycle-count benchmark: shards an all-zero vector of 1000 entries with
+/// `Prio3::new_sum_vec(2, 1, 1000, 31)` (all-zero nonce) and returns the input shares.
+fn prio3_client_count_vec_1000() -> Vec<Prio3InputShare<Field128, 16>> {
+    const LEN: usize = 1000;
+    let vdaf = Prio3::new_sum_vec(2, 1, LEN, 31).unwrap();
+    let measurement = vec![0; LEN];
+    let nonce = [0; 16];
+    let (_public_share, input_shares) = vdaf
+        .shard(&black_box(measurement), &black_box(nonce))
+        .unwrap();
+    input_shares
+}
+
+/// Cycle-count benchmark: shards an all-zero vector of 1000 entries with
+/// `Prio3::new_sum_vec_multithreaded(2, 1, 1000, 31)` (all-zero nonce) and returns the input
+/// shares.
+#[cfg(feature = "multithreaded")]
+fn prio3_client_count_vec_multithreaded_1000() -> Vec<Prio3InputShare<Field128, 16>> {
+    const LEN: usize = 1000;
+    let vdaf = Prio3::new_sum_vec_multithreaded(2, 1, LEN, 31).unwrap();
+    let measurement = vec![0; LEN];
+    let nonce = [0; 16];
+    let (_public_share, input_shares) = vdaf
+        .shard(&black_box(measurement), &black_box(nonce))
+        .unwrap();
+    input_shares
+}
+
+/// Runs IDPF key generation for `input` with the given inner-level and leaf values, using an
+/// all-zero 16-byte final argument, and panics if generation fails. The output is discarded.
+#[cfg(feature = "experimental")]
+fn idpf_poplar_gen(
+    input: &IdpfInput,
+    inner_values: Vec<Poplar1IdpfValue<Field64>>,
+    leaf_value: Poplar1IdpfValue<Field255>,
+) {
+    let zeros = [0; 16];
+    let generator = Idpf::new((), ());
+    generator
+        .gen(input, inner_values, leaf_value, &zeros)
+        .unwrap();
+}
+
+/// Cycle-count benchmark: IDPF key generation for the one-byte input `b"A"` with 7 inner
+/// values and one leaf value, all built from field ones.
+#[cfg(feature = "experimental")]
+fn idpf_poplar_gen_8() {
+    let input = IdpfInput::from_bytes(b"A");
+    let inner_one = Field64::one();
+    let inner_values = vec![Poplar1IdpfValue::new([inner_one, inner_one]); 7];
+    let leaf_value = Poplar1IdpfValue::new([Field255::one(), Field255::one()]);
+    idpf_poplar_gen(&input, inner_values, leaf_value);
+}
+
+/// Cycle-count benchmark: IDPF key generation for a 16-byte input of `A` characters with
+/// 127 inner values and one leaf value, all built from field ones.
+#[cfg(feature = "experimental")]
+fn idpf_poplar_gen_128() {
+    let input = IdpfInput::from_bytes(b"AAAAAAAAAAAAAAAA");
+    let inner_one = Field64::one();
+    let inner_values = vec![Poplar1IdpfValue::new([inner_one, inner_one]); 127];
+    let leaf_value = Poplar1IdpfValue::new([Field255::one(), Field255::one()]);
+    idpf_poplar_gen(&input, inner_values, leaf_value);
+}
+
+/// Cycle-count benchmark: IDPF key generation for a 256-byte input of `0x41` bytes with
+/// 2047 inner values and one leaf value, all built from field ones.
+#[cfg(feature = "experimental")]
+fn idpf_poplar_gen_2048() {
+    let input = IdpfInput::from_bytes(&[0x41; 256]);
+    let inner_one = Field64::one();
+    let inner_values = vec![Poplar1IdpfValue::new([inner_one, inner_one]); 2047];
+    let leaf_value = Poplar1IdpfValue::new([Field255::one(), Field255::one()]);
+    idpf_poplar_gen(&input, inner_values, leaf_value);
+}
+
+/// Evaluates the IDPF at `input` using `public_share` and `key`, with a fresh
+/// `RingBufferCache::new(1)` and an all-zero 16-byte argument, and panics if evaluation
+/// fails. The output is discarded.
+#[cfg(feature = "experimental")]
+fn idpf_poplar_eval(
+    input: &IdpfInput,
+    public_share: &IdpfPublicShare<Poplar1IdpfValue<Field64>, Poplar1IdpfValue<Field255>>,
+    key: &Seed<16>,
+) {
+    let zeros = [0; 16];
+    let mut cache = RingBufferCache::new(1);
+    let evaluator = Idpf::new((), ());
+    evaluator
+        .eval(0, public_share, key, input, &zeros, &mut cache)
+        .unwrap();
+}
+
+/// Cycle-count benchmark: IDPF evaluation for the one-byte input `b"A"`, with a public share
+/// decoded (parameter 8) from 306 bytes of `0x7f` and a key decoded from 16 bytes of `0xff`.
+#[cfg(feature = "experimental")]
+fn idpf_poplar_eval_8() {
+    let input = IdpfInput::from_bytes(b"A");
+    let bits = 8;
+    let encoded_public_share = [0x7f; 306];
+    let public_share =
+        IdpfPublicShare::get_decoded_with_param(&bits, &encoded_public_share).unwrap();
+    let encoded_key = [0xff; 16];
+    let key = Seed::get_decoded(&encoded_key).unwrap();
+    idpf_poplar_eval(&input, &public_share, &key);
+}
+
+/// Cycle-count benchmark: IDPF evaluation for a 16-byte input of `A` characters, with a
+/// public share decoded (parameter 128) from 4176 bytes of `0x7f` and a key decoded from 16
+/// bytes of `0xff`.
+#[cfg(feature = "experimental")]
+fn idpf_poplar_eval_128() {
+    let input = IdpfInput::from_bytes(b"AAAAAAAAAAAAAAAA");
+    let bits = 128;
+    let encoded_public_share = [0x7f; 4176];
+    let public_share =
+        IdpfPublicShare::get_decoded_with_param(&bits, &encoded_public_share).unwrap();
+    let encoded_key = [0xff; 16];
+    let key = Seed::get_decoded(&encoded_key).unwrap();
+    idpf_poplar_eval(&input, &public_share, &key);
+}
+
+/// Cycle-count benchmark: IDPF evaluation for a 256-byte input of `0x41` bytes, with a
+/// public share decoded (parameter 2048) from 66096 bytes of `0x7f` and a key decoded from
+/// 16 bytes of `0xff`.
+#[cfg(feature = "experimental")]
+fn idpf_poplar_eval_2048() {
+    let input = IdpfInput::from_bytes(&[0x41; 256]);
+    let bits = 2048;
+    let encoded_public_share = [0x7f; 66096];
+    let public_share =
+        IdpfPublicShare::get_decoded_with_param(&bits, &encoded_public_share).unwrap();
+    let encoded_key = [0xff; 16];
+    let key = Seed::get_decoded(&encoded_key).unwrap();
+    idpf_poplar_eval(&input, &public_share, &key);
+}
+
+/// Cycle-count benchmark: decodes a fixed hex-encoded IDPF public share (parameter 4),
+/// re-encodes it, and feeds the encoded length to `black_box`.
+#[cfg(feature = "experimental")]
+fn idpf_codec() {
+    let encoded_input = hex::decode(concat!(
+        "9a",
+        "0000000000000000000000000000000000000000000000",
+        "01eb3a1bd6b5fa4a4500000000000000000000000000000000",
+        "ffffffff0000000022522c3fd5a33cac00000000000000000000000000000000",
+        "ffffffff0000000069f41eee46542b6900000000000000000000000000000000",
+        "00000000000000000000000000000000000000000000000000000000000000",
+        "017d1fd6df94280145a0dcc933ceb706e9219d50e7c4f92fd8ca9a0ffb7d819646",
+    ))
+    .unwrap();
+    let bits = 4;
+    let public_share: IdpfPublicShare<Poplar1IdpfValue<Field64>, Poplar1IdpfValue<Field255>> =
+        IdpfPublicShare::get_decoded_with_param(&bits, &encoded_input).unwrap();
+    let reencoded = public_share.get_encoded();
+    // Route the result through `black_box` so the round trip is not optimized away.
+    let _ = black_box(reencoded.len());
+}
+
+// Innermost link of the `main_add_*` macro chain: invokes `iai::main!` with the benchmarks
+// that need no optional feature, followed by every benchmark name handed down by the
+// feature-specific wrapper macros.
+macro_rules! main_base {
+    ( $( $bench:ident ),* $(,)* ) => {
+        iai::main!(
+            prng_16,
+            prng_256,
+            prng_1024,
+            prng_4096,
+            prio3_client_count,
+            prio3_client_histogram_10,
+            prio3_client_sum_32,
+            prio3_client_count_vec_1000,
+            $( $bench, )*
+        );
+    };
+}
+
+// Variant used when the `prio2` feature is enabled: prepends the Prio2 benchmarks to the
+// names it receives and forwards the combined list to `main_base!`.
+#[cfg(feature = "prio2")]
+macro_rules! main_add_prio2 {
+    ( $( $bench:ident ),* $(,)* ) => {
+        main_base!(
+            prio2_client_10,
+            prio2_client_100,
+            prio2_client_1000,
+            prio2_shard_and_prepare_10,
+            prio2_shard_and_prepare_100,
+            prio2_shard_and_prepare_1000,
+            $( $bench, )*
+        );
+    };
+}
+
+// Variant used when the `prio2` feature is disabled: forwards the received benchmark names
+// to `main_base!` unchanged.
+#[cfg(not(feature = "prio2"))]
+macro_rules! main_add_prio2 {
+    ( $( $bench:ident ),* $(,)* ) => {
+        main_base!(
+            $( $bench, )*
+        );
+    };
+}
+
+// Variant used when the `multithreaded` feature is enabled: prepends the multithreaded
+// Prio3 benchmark to the names it receives and forwards the list to `main_add_prio2!`.
+#[cfg(feature = "multithreaded")]
+macro_rules! main_add_multithreaded {
+    ( $( $bench:ident ),* $(,)* ) => {
+        main_add_prio2!(
+            prio3_client_count_vec_multithreaded_1000,
+            $( $bench, )*
+        );
+    };
+}
+
+// Variant used when the `multithreaded` feature is disabled: forwards the received
+// benchmark names to `main_add_prio2!` unchanged.
+#[cfg(not(feature = "multithreaded"))]
+macro_rules! main_add_multithreaded {
+    ( $( $bench:ident ),* $(,)* ) => {
+        main_add_prio2!(
+            $( $bench, )*
+        );
+    };
+}
+
+// Variant used when the `experimental` feature is enabled: prepends the IDPF benchmarks to
+// the names it receives and forwards the list to `main_add_multithreaded!`.
+#[cfg(feature = "experimental")]
+macro_rules! main_add_experimental {
+    ( $( $bench:ident ),* $(,)* ) => {
+        main_add_multithreaded!(
+            idpf_codec,
+            idpf_poplar_gen_8,
+            idpf_poplar_gen_128,
+            idpf_poplar_gen_2048,
+            idpf_poplar_eval_8,
+            idpf_poplar_eval_128,
+            idpf_poplar_eval_2048,
+            $( $bench, )*
+        );
+    };
+}
+
+// Variant used when the `experimental` feature is disabled: forwards the received
+// benchmark names to `main_add_multithreaded!` unchanged.
+#[cfg(not(feature = "experimental"))]
+macro_rules! main_add_experimental {
+    ( $( $bench:ident ),* $(,)* ) => {
+        main_add_multithreaded!(
+            $( $bench, )*
+        );
+    };
+}
+
+// Entry point selection: on Windows a stub `main` only prints a notice; on every other
+// target the macro chain starting at `main_add_experimental!` generates the iai entry point.
+cfg_if! {
+    if #[cfg(windows)] {
+        fn main() {
+            eprintln!("Cycle count benchmarks are not supported on Windows.");
+        }
+    } else {
+        main_add_experimental!();
+    }
+}
diff --git a/third_party/rust/prio/benches/speed_tests.rs b/third_party/rust/prio/benches/speed_tests.rs
new file mode 100644
index 0000000000..66458b1ada
--- /dev/null
+++ b/third_party/rust/prio/benches/speed_tests.rs
@@ -0,0 +1,876 @@
+// SPDX-License-Identifier: MPL-2.0
+
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+#[cfg(feature = "experimental")]
+use criterion::{BatchSize, Throughput};
+#[cfg(feature = "experimental")]
+use fixed::types::{I1F15, I1F31};
+#[cfg(feature = "experimental")]
+use fixed_macro::fixed;
+#[cfg(feature = "experimental")]
+use num_bigint::BigUint;
+#[cfg(feature = "experimental")]
+use num_rational::Ratio;
+#[cfg(feature = "experimental")]
+use num_traits::ToPrimitive;
+#[cfg(feature = "experimental")]
+use prio::dp::distributions::DiscreteGaussian;
+#[cfg(feature = "prio2")]
+use prio::vdaf::prio2::Prio2;
+use prio::{
+    benchmarked::*,
+    field::{random_vector, Field128 as F, FieldElement},
+    flp::gadgets::Mul,
+    vdaf::{prio3::Prio3, Aggregator, Client},
+};
+#[cfg(feature = "experimental")]
+use prio::{
+    field::{Field255, Field64},
+    flp::types::fixedpoint_l2::FixedPointBoundedL2VecSum,
+    idpf::{Idpf, IdpfInput, RingBufferCache},
+    vdaf::poplar1::{Poplar1, Poplar1AggregationParam, Poplar1IdpfValue},
+};
+#[cfg(feature = "experimental")]
+use rand::prelude::*;
+#[cfg(feature = "experimental")]
+use std::iter;
+use std::time::Duration;
+#[cfg(feature = "experimental")]
+use zipf::ZipfDistribution;
+
+/// Seed for the `StdRng` instances that generate random benchmark inputs (e.g. in `dp_noise`).
+///
+/// The seed is fixed so that inputs are identical from run to run, which minimizes run-to-run
+/// variability. The value may be freely changed to get a different set of inputs.
+#[cfg(feature = "experimental")]
+const RNG_SEED: u64 = 0;
+
+/// Speed test for `random_vector`, i.e. generating a seed and deriving a pseudorandom sequence
+/// of field elements, at several output lengths (benchmark group `rand`).
+fn prng(c: &mut Criterion) {
+    let mut group = c.benchmark_group("rand");
+    for len in [16, 256, 1024, 4096] {
+        group.bench_with_input(BenchmarkId::from_parameter(len), &len, |b, &len| {
+            b.iter(|| random_vector::<F>(len))
+        });
+    }
+    group.finish();
+}
+
+/// Speed test for drawing samples from the discrete Gaussian distribution at several
+/// standard deviations (benchmark group `dp_noise`).
+///
+/// Each benchmark is labelled with the standard deviation converted to `f64`, falling back
+/// to infinity when that conversion yields no value.
+#[cfg(feature = "experimental")]
+pub fn dp_noise(c: &mut Criterion) {
+    let mut group = c.benchmark_group("dp_noise");
+    let mut rng = StdRng::seed_from_u64(RNG_SEED);
+
+    let std_devs = [
+        Ratio::<BigUint>::from_integer(BigUint::from(u128::MAX)).pow(2),
+        Ratio::<BigUint>::from_integer(BigUint::from(u64::MAX)),
+        Ratio::<BigUint>::from_integer(BigUint::from(u32::MAX)),
+        Ratio::<BigUint>::from_integer(BigUint::from(5u8)),
+        Ratio::<BigUint>::new(BigUint::from(10000u32), BigUint::from(23u32)),
+    ];
+    for std_dev in std_devs {
+        // Compute the label before the ratio is moved into the sampler.
+        let label = std_dev.to_f64().unwrap_or(f64::INFINITY);
+        let sampler = DiscreteGaussian::new(std_dev).unwrap();
+        group.bench_function(BenchmarkId::new("discrete_gaussian", label), |b| {
+            b.iter(|| sampler.sample(&mut rng))
+        });
+    }
+    group.finish();
+}
+
+/// Compares FFT-based and direct polynomial multiplication in the `Mul` gadget.
+///
+/// Polynomial multiplication costs `O(n log n)` asymptotically with FFT and `O(n^2)` with the
+/// naive method, but the naive method has better concrete performance for small polynomials.
+/// The results of this benchmark are used to pick the `FFT_THRESHOLD` constant in
+/// `src/flp/gadgets.rs`.
+fn poly_mul(c: &mut Criterion) {
+    let mut group = c.benchmark_group("poly_mul");
+    for size in [1_usize, 30, 60, 90, 120, 150] {
+        group.bench_with_input(BenchmarkId::new("fft", size), &size, |b, &size| {
+            let len = (size + 1).next_power_of_two();
+            let mut gadget: Mul<F> = Mul::new(size);
+            let mut out = vec![F::zero(); 2 * len];
+            // Both inputs are clones of the same random vector.
+            let inputs = vec![random_vector(len).unwrap(); 2];
+
+            b.iter(|| {
+                benchmarked_gadget_mul_call_poly_fft(&mut gadget, &mut out, &inputs).unwrap();
+            })
+        });
+
+        group.bench_with_input(BenchmarkId::new("direct", size), &size, |b, &size| {
+            let len = (size + 1).next_power_of_two();
+            let mut gadget: Mul<F> = Mul::new(size);
+            let mut out = vec![F::zero(); 2 * len];
+            // Both inputs are clones of the same random vector.
+            let inputs = vec![random_vector(len).unwrap(); 2];
+
+            b.iter(|| {
+                benchmarked_gadget_mul_call_poly_direct(&mut gadget, &mut out, &inputs).unwrap();
+            })
+        });
+    }
+    group.finish();
+}
+
+/// Benchmark prio2: sharding (group `prio2_shard`) and the first preparation step (group
+/// `prio2_prepare_init`) for measurements of 10, 100 and 1000 entries.
+#[cfg(feature = "prio2")]
+fn prio2(c: &mut Criterion) {
+    /// Builds a measurement of `len` entries alternating 0, 1, 0, 1, ...
+    fn alternating_bits(len: usize) -> Vec<u32> {
+        (0..u32::try_from(len).unwrap()).map(|i| i & 1).collect()
+    }
+
+    let input_lengths = [10, 100, 1_000];
+
+    let mut shard_group = c.benchmark_group("prio2_shard");
+    for len in input_lengths {
+        shard_group.bench_with_input(BenchmarkId::from_parameter(len), &len, |b, &len| {
+            let vdaf = Prio2::new(len).unwrap();
+            let measurement = alternating_bits(len);
+            let nonce = black_box([0u8; 16]);
+            b.iter(|| vdaf.shard(&measurement, &nonce).unwrap());
+        });
+    }
+    shard_group.finish();
+
+    let mut prepare_group = c.benchmark_group("prio2_prepare_init");
+    for len in input_lengths {
+        prepare_group.bench_with_input(BenchmarkId::from_parameter(len), &len, |b, &len| {
+            let vdaf = Prio2::new(len).unwrap();
+            let measurement = alternating_bits(len);
+            let nonce = black_box([0u8; 16]);
+            let verify_key = black_box([0u8; 32]);
+            // Sharding happens once, outside the timed closure; only `prepare_init` is timed.
+            let (public_share, input_shares) = vdaf.shard(&measurement, &nonce).unwrap();
+            b.iter(|| {
+                vdaf.prepare_init(&verify_key, 0, &(), &nonce, &public_share, &input_shares[0])
+                    .unwrap();
+            });
+        });
+    }
+    prepare_group.finish();
+}
+
+/// Benchmark prio3.
+fn prio3(c: &mut Criterion) {
+    let num_shares = 2;
+
+    c.bench_function("prio3count_shard", |b| {
+        let vdaf = Prio3::new_count(num_shares).unwrap();
+        let measurement = black_box(1);
+        let nonce = black_box([0u8; 16]);
+        b.iter(|| vdaf.shard(&measurement, &nonce).unwrap());
+    });
+
+    c.bench_function("prio3count_prepare_init", |b| {
+        let vdaf = Prio3::new_count(num_shares).unwrap();
+        let measurement = black_box(1);
+        let nonce = black_box([0u8; 16]);
+        let verify_key = black_box([0u8; 16]);
+        let (public_share, input_shares) = vdaf.shard(&measurement, &nonce).unwrap();
+        b.iter(|| {
+            vdaf.prepare_init(&verify_key, 0, &(), &nonce, &public_share, &input_shares[0])
+                .unwrap()
+        });
+    });
+
+    let mut group = c.benchmark_group("prio3sum_shard");
+    for bits in [8, 32] {
+        group.bench_with_input(BenchmarkId::from_parameter(bits), &bits, |b, bits| {
+            let vdaf = Prio3::new_sum(num_shares, *bits).unwrap();
+            let measurement = (1 << bits) - 1;
+            let nonce = black_box([0u8; 16]);
+            b.iter(|| vdaf.shard(&measurement, &nonce).unwrap());
+        });
+    }
+    group.finish();
+
+    let mut group = c.benchmark_group("prio3sum_prepare_init");
+    for bits in [8, 32] {
+        group.bench_with_input(BenchmarkId::from_parameter(bits), &bits, |b, bits| {
+            let vdaf = Prio3::new_sum(num_shares, *bits).unwrap();
+            let measurement = (1 << bits) - 1;
+            let nonce = black_box([0u8; 16]);
+            let verify_key = black_box([0u8; 16]);
+            let (public_share, input_shares) = vdaf.shard(&measurement, &nonce).unwrap();
+            b.iter(|| {
+                vdaf.prepare_init(&verify_key, 0, &(), &nonce, &public_share, &input_shares[0])
+                    .unwrap()
+            });
+        });
+    }
+    group.finish();
+
+    let mut group = c.benchmark_group("prio3sumvec_shard");
+    for (input_length, chunk_length) in [(10, 3), (100, 10), (1_000, 31)] {
+        group.bench_with_input(
+            BenchmarkId::new("serial", input_length),
+            &(input_length, chunk_length),
+            |b, (input_length, chunk_length)| {
+                let vdaf = Prio3::new_sum_vec(num_shares, 1, *input_length, *chunk_length).unwrap();
+                let measurement = (0..u128::try_from(*input_length).unwrap())
+                    .map(|i| i & 1)
+                    .collect::<Vec<_>>();
+                let nonce = black_box([0u8; 16]);
+                b.iter(|| vdaf.shard(&measurement, &nonce).unwrap());
+            },
+        );
+    }
+
+    #[cfg(feature = "multithreaded")]
+    {
+        for (input_length, chunk_length) in [(10, 3), (100, 10), (1_000, 31)] {
+            group.bench_with_input(
+                BenchmarkId::new("parallel", input_length),
+                &(input_length, chunk_length),
+                |b, (input_length, chunk_length)| {
+                    let vdaf = Prio3::new_sum_vec_multithreaded(
+                        num_shares,
+                        1,
+                        *input_length,
+                        *chunk_length,
+                    )
+                    .unwrap();
+                    let measurement = (0..u128::try_from(*input_length).unwrap())
+                        .map(|i| i & 1)
+                        .collect::<Vec<_>>();
+                    let nonce = black_box([0u8; 16]);
+                    b.iter(|| vdaf.shard(&measurement, &nonce).unwrap());
+                },
+            );
+        }
+    }
+    group.finish();
+
+    let mut group = c.benchmark_group("prio3sumvec_prepare_init");
+    for (input_length, chunk_length) in [(10, 3), (100, 10), (1_000, 31)] {
+        group.bench_with_input(
+            BenchmarkId::new("serial", input_length),
+            &(input_length, chunk_length),
+            |b, (input_length, chunk_length)| {
+                let vdaf = Prio3::new_sum_vec(num_shares, 1, *input_length, *chunk_length).unwrap();
+                let measurement = (0..u128::try_from(*input_length).unwrap())
+                    .map(|i| i & 1)
+                    .collect::<Vec<_>>();
+                let nonce = black_box([0u8; 16]);
+                let verify_key = black_box([0u8; 16]);
+                let (public_share, input_shares) = vdaf.shard(&measurement, &nonce).unwrap();
+                b.iter(|| {
+                    vdaf.prepare_init(&verify_key, 0, &(), &nonce, &public_share, &input_shares[0])
+                        .unwrap()
+                });
+            },
+        );
+    }
+
+    #[cfg(feature = "multithreaded")]
+    {
+        for (input_length, chunk_length) in [(10, 3), (100, 10), (1_000, 31)] {
+            group.bench_with_input(
+                BenchmarkId::new("parallel", input_length),
+                &(input_length, chunk_length),
+                |b, (input_length, chunk_length)| {
+                    let vdaf = Prio3::new_sum_vec_multithreaded(
+                        num_shares,
+                        1,
+                        *input_length,
+                        *chunk_length,
+                    )
+                    .unwrap();
+                    let measurement = (0..u128::try_from(*input_length).unwrap())
+                        .map(|i| i & 1)
+                        .collect::<Vec<_>>();
+                    let nonce = black_box([0u8; 16]);
+                    let verify_key = black_box([0u8; 16]);
+                    let (public_share, input_shares) = vdaf.shard(&measurement, &nonce).unwrap();
+                    b.iter(|| {
+                        vdaf.prepare_init(
+                            &verify_key,
+                            0,
+                            &(),
+                            &nonce,
+                            &public_share,
+                            &input_shares[0],
+                        )
+                        .unwrap()
+                    });
+                },
+            );
+        }
+    }
+    group.finish();
+
+    let mut group = c.benchmark_group("prio3histogram_shard");
+    for (input_length, chunk_length) in [
+        (10, 3),
+        (100, 10),
+        (1_000, 31),
+        (10_000, 100),
+        (100_000, 316),
+    ] {
+        if input_length >= 100_000 {
+            group.measurement_time(Duration::from_secs(15));
+        }
+        group.bench_with_input(
+            BenchmarkId::new("serial", input_length),
+            &(input_length, chunk_length),
+            |b, (input_length, chunk_length)| {
+                let vdaf = Prio3::new_histogram(num_shares, *input_length, *chunk_length).unwrap();
+                let measurement = black_box(0);
+                let nonce = black_box([0u8; 16]);
+                b.iter(|| vdaf.shard(&measurement, &nonce).unwrap());
+            },
+        );
+    }
+
+    #[cfg(feature = "multithreaded")]
+    {
+        for (input_length, chunk_length) in [
+            (10, 3),
+            (100, 10),
+            (1_000, 31),
+            (10_000, 100),
+            (100_000, 316),
+        ] {
+            if input_length >= 100_000 {
+                group.measurement_time(Duration::from_secs(15));
+            }
+            group.bench_with_input(
+                BenchmarkId::new("parallel", input_length),
+                &(input_length, chunk_length),
+                |b, (input_length, chunk_length)| {
+                    let vdaf = Prio3::new_histogram_multithreaded(
+                        num_shares,
+                        *input_length,
+                        *chunk_length,
+                    )
+                    .unwrap();
+                    let measurement = black_box(0);
+                    let nonce = black_box([0u8; 16]);
+                    b.iter(|| vdaf.shard(&measurement, &nonce).unwrap());
+                },
+            );
+        }
+    }
+    group.finish();
+
+    let mut group = c.benchmark_group("prio3histogram_prepare_init");
+    for (input_length, chunk_length) in [
+        (10, 3),
+        (100, 10),
+        (1_000, 31),
+        (10_000, 100),
+        (100_000, 316),
+    ] {
+        if input_length >= 100_000 {
+            group.measurement_time(Duration::from_secs(15));
+        }
+        group.bench_with_input(
+            BenchmarkId::new("serial", input_length),
+            &(input_length, chunk_length),
+            |b, (input_length, chunk_length)| {
+                let vdaf = Prio3::new_histogram(num_shares, *input_length, *chunk_length).unwrap();
+                let measurement = black_box(0);
+                let nonce = black_box([0u8; 16]);
+                let verify_key = black_box([0u8; 16]);
+                let (public_share, input_shares) = vdaf.shard(&measurement, &nonce).unwrap();
+                b.iter(|| {
+                    vdaf.prepare_init(&verify_key, 0, &(), &nonce, &public_share, &input_shares[0])
+                        .unwrap()
+                });
+            },
+        );
+    }
+
+    #[cfg(feature = "multithreaded")]
+    {
+        for (input_length, chunk_length) in [
+            (10, 3),
+            (100, 10),
+            (1_000, 31),
+            (10_000, 100),
+            (100_000, 316),
+        ] {
+            if input_length >= 100_000 {
+                group.measurement_time(Duration::from_secs(15));
+            }
+            group.bench_with_input(
+                BenchmarkId::new("parallel", input_length),
+                &(input_length, chunk_length),
+                |b, (input_length, chunk_length)| {
+                    let vdaf = Prio3::new_histogram_multithreaded(
+                        num_shares,
+                        *input_length,
+                        *chunk_length,
+                    )
+                    .unwrap();
+                    let measurement = black_box(0);
+                    let nonce = black_box([0u8; 16]);
+                    let verify_key = black_box([0u8; 16]);
+                    let (public_share, input_shares) = vdaf.shard(&measurement, &nonce).unwrap();
+                    b.iter(|| {
+                        vdaf.prepare_init(
+                            &verify_key,
+                            0,
+                            &(),
+                            &nonce,
+                            &public_share,
+                            &input_shares[0],
+                        )
+                        .unwrap()
+                    });
+                },
+            );
+        }
+    }
+    group.finish();
+
+    #[cfg(feature = "experimental")]
+    {
+        let mut group = c.benchmark_group("prio3fixedpointboundedl2vecsum_i1f15_shard");
+        for dimension in [10, 100, 1_000] {
+            group.bench_with_input(
+                BenchmarkId::new("serial", dimension),
+                &dimension,
+                |b, dimension| {
+                    let vdaf: Prio3<FixedPointBoundedL2VecSum<I1F15, _, _>, _, 16> =
+                        Prio3::new_fixedpoint_boundedl2_vec_sum(num_shares, *dimension).unwrap();
+                    let mut measurement = vec![fixed!(0: I1F15); *dimension];
+                    measurement[0] = fixed!(0.5: I1F15);
+                    let nonce = black_box([0u8; 16]);
+                    b.iter(|| vdaf.shard(&measurement, &nonce).unwrap());
+                },
+            );
+        }
+
+        #[cfg(feature = "multithreaded")]
+        {
+            for dimension in [10, 100, 1_000] {
+                group.bench_with_input(
+                    BenchmarkId::new("parallel", dimension),
+                    &dimension,
+                    |b, dimension| {
+                        let vdaf: Prio3<FixedPointBoundedL2VecSum<I1F15, _, _>, _, 16> =
+                            Prio3::new_fixedpoint_boundedl2_vec_sum_multithreaded(
+                                num_shares, *dimension,
+                            )
+                            .unwrap();
+                        let mut measurement = vec![fixed!(0: I1F15); *dimension];
+                        measurement[0] = fixed!(0.5: I1F15);
+                        let nonce = black_box([0u8; 16]);
+                        b.iter(|| vdaf.shard(&measurement, &nonce).unwrap());
+                    },
+                );
+            }
+        }
+        group.finish();
+
+        let mut group = c.benchmark_group("prio3fixedpointboundedl2vecsum_i1f15_prepare_init");
+        for dimension in [10, 100, 1_000] {
+            group.bench_with_input(
+                BenchmarkId::new("series", dimension),
+                &dimension,
+                |b, dimension| {
+                    let vdaf: Prio3<FixedPointBoundedL2VecSum<I1F15, _, _>, _, 16> =
+                        Prio3::new_fixedpoint_boundedl2_vec_sum(num_shares, *dimension).unwrap();
+                    let mut measurement = vec![fixed!(0: I1F15); *dimension];
+                    measurement[0] = fixed!(0.5: I1F15);
+                    let nonce = black_box([0u8; 16]);
+                    let verify_key = black_box([0u8; 16]);
+                    let (public_share, input_shares) = vdaf.shard(&measurement, &nonce).unwrap();
+                    b.iter(|| {
+                        vdaf.prepare_init(
+                            &verify_key,
+                            0,
+                            &(),
+                            &nonce,
+                            &public_share,
+                            &input_shares[0],
+                        )
+                        .unwrap()
+                    });
+                },
+            );
+        }
+
+        #[cfg(feature = "multithreaded")]
+        {
+            for dimension in [10, 100, 1_000] {
+                group.bench_with_input(
+                    BenchmarkId::new("parallel", dimension),
+                    &dimension,
+                    |b, dimension| {
+                        let vdaf: Prio3<FixedPointBoundedL2VecSum<I1F15, _, _>, _, 16> =
+                            Prio3::new_fixedpoint_boundedl2_vec_sum_multithreaded(
+                                num_shares, *dimension,
+                            )
+                            .unwrap();
+                        let mut measurement = vec![fixed!(0: I1F15); *dimension];
+                        measurement[0] = fixed!(0.5: I1F15);
+                        let nonce = black_box([0u8; 16]);
+                        let verify_key = black_box([0u8; 16]);
+                        let (public_share, input_shares) =
+                            vdaf.shard(&measurement, &nonce).unwrap();
+                        b.iter(|| {
+                            vdaf.prepare_init(
+                                &verify_key,
+                                0,
+                                &(),
+                                &nonce,
+                                &public_share,
+                                &input_shares[0],
+                            )
+                            .unwrap()
+                        });
+                    },
+                );
+            }
+        }
+        group.finish();
+
+        let mut group = c.benchmark_group("prio3fixedpointboundedl2vecsum_i1f31_shard");
+        for dimension in [10, 100, 1_000] {
+            group.bench_with_input(
+                BenchmarkId::new("serial", dimension),
+                &dimension,
+                |b, dimension| {
+                    let vdaf: Prio3<FixedPointBoundedL2VecSum<I1F31, _, _>, _, 16> =
+                        Prio3::new_fixedpoint_boundedl2_vec_sum(num_shares, *dimension).unwrap();
+                    let mut measurement = vec![fixed!(0: I1F31); *dimension];
+                    measurement[0] = fixed!(0.5: I1F31);
+                    let nonce = black_box([0u8; 16]);
+                    b.iter(|| vdaf.shard(&measurement, &nonce).unwrap());
+                },
+            );
+        }
+
+        #[cfg(feature = "multithreaded")]
+        {
+            for dimension in [10, 100, 1_000] {
+                group.bench_with_input(
+                    BenchmarkId::new("parallel", dimension),
+                    &dimension,
+                    |b, dimension| {
+                        let vdaf: Prio3<FixedPointBoundedL2VecSum<I1F31, _, _>, _, 16> =
+                            Prio3::new_fixedpoint_boundedl2_vec_sum_multithreaded(
+                                num_shares, *dimension,
+                            )
+                            .unwrap();
+                        let mut measurement = vec![fixed!(0: I1F31); *dimension];
+                        measurement[0] = fixed!(0.5: I1F31);
+                        let nonce = black_box([0u8; 16]);
+                        b.iter(|| vdaf.shard(&measurement, &nonce).unwrap());
+                    },
+                );
+            }
+        }
+        group.finish();
+
+        let mut group = c.benchmark_group("prio3fixedpointboundedl2vecsum_i1f31_prepare_init");
+        for dimension in [10, 100, 1_000] {
+            group.bench_with_input(
+                BenchmarkId::new("series", dimension),
+                &dimension,
+                |b, dimension| {
+                    let vdaf: Prio3<FixedPointBoundedL2VecSum<I1F31, _, _>, _, 16> =
+                        Prio3::new_fixedpoint_boundedl2_vec_sum(num_shares, *dimension).unwrap();
+                    let mut measurement = vec![fixed!(0: I1F31); *dimension];
+                    measurement[0] = fixed!(0.5: I1F31);
+                    let nonce = black_box([0u8; 16]);
+                    let verify_key = black_box([0u8; 16]);
+                    let (public_share, input_shares) = vdaf.shard(&measurement, &nonce).unwrap();
+                    b.iter(|| {
+                        vdaf.prepare_init(
+                            &verify_key,
+                            0,
+                            &(),
+                            &nonce,
+                            &public_share,
+                            &input_shares[0],
+                        )
+                        .unwrap()
+                    });
+                },
+            );
+        }
+
+        #[cfg(feature = "multithreaded")]
+        {
+            for dimension in [10, 100, 1_000] {
+                group.bench_with_input(
+                    BenchmarkId::new("parallel", dimension),
+                    &dimension,
+                    |b, dimension| {
+                        let vdaf: Prio3<FixedPointBoundedL2VecSum<I1F31, _, _>, _, 16> =
+                            Prio3::new_fixedpoint_boundedl2_vec_sum_multithreaded(
+                                num_shares, *dimension,
+                            )
+                            .unwrap();
+                        let mut measurement = vec![fixed!(0: I1F31); *dimension];
+                        measurement[0] = fixed!(0.5: I1F31);
+                        let nonce = black_box([0u8; 16]);
+                        let verify_key = black_box([0u8; 16]);
+                        let (public_share, input_shares) =
+                            vdaf.shard(&measurement, &nonce).unwrap();
+                        b.iter(|| {
+                            vdaf.prepare_init(
+                                &verify_key,
+                                0,
+                                &(),
+                                &nonce,
+                                &public_share,
+                                &input_shares[0],
+                            )
+                            .unwrap()
+                        });
+                    },
+                );
+            }
+        }
+        group.finish();
+    }
+}
+
+/// Benchmark IDPF key generation and evaluation.
+///
+/// Registers the Criterion groups `idpf_gen` and `idpf_eval`. Each one is run for inputs of 8,
+/// 128 and 2048 bits, with throughput reported in bytes of input (`size / 8`).
+#[cfg(feature = "experimental")]
+fn idpf(c: &mut Criterion) {
+    let test_sizes = [8usize, 8 * 16, 8 * 256];
+
+    // Draw a random workload for a `bit_len`-bit input: the input itself, one value per inner
+    // level (`bit_len - 1` of them) and the leaf value. Every value pairs the constant one with
+    // a random field element.
+    let random_workload = |bit_len: usize| {
+        let bits: Vec<bool> = iter::repeat_with(random).take(bit_len).collect();
+        let input = IdpfInput::from_bools(&bits);
+
+        let inner_values: Vec<_> = random_vector::<Field64>(bit_len - 1)
+            .unwrap()
+            .into_iter()
+            .map(|element| Poplar1IdpfValue::new([Field64::one(), element]))
+            .collect();
+        let leaf_value =
+            Poplar1IdpfValue::new([Field255::one(), random_vector::<Field255>(1).unwrap()[0]]);
+
+        (input, inner_values, leaf_value)
+    };
+
+    let mut gen_group = c.benchmark_group("idpf_gen");
+    for &size in &test_sizes {
+        gen_group.throughput(Throughput::Bytes(size as u64 / 8));
+        gen_group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, &size| {
+            let (input, inner_values, leaf_value) = random_workload(size);
+            let idpf = Idpf::new((), ());
+
+            // `gen` takes the inner values by value, so they are cloned on every iteration.
+            b.iter(|| {
+                idpf.gen(&input, inner_values.clone(), leaf_value, &[0; 16])
+                    .unwrap();
+            });
+        });
+    }
+    gen_group.finish();
+
+    let mut eval_group = c.benchmark_group("idpf_eval");
+    for &size in &test_sizes {
+        eval_group.throughput(Throughput::Bytes(size as u64 / 8));
+        eval_group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, &size| {
+            let (input, inner_values, leaf_value) = random_workload(size);
+            let idpf = Idpf::new((), ());
+
+            // Key generation happens once, outside the timed closure.
+            let (public_share, keys) = idpf
+                .gen(&input, inner_values, leaf_value, &[0; 16])
+                .unwrap();
+
+            b.iter(|| {
+                // Keep the cache as small as possible so that it has minimal impact on the
+                // measurement. This synthetic benchmark evaluates a single candidate prefix
+                // per level (real workloads typically have many), so the small capacity does
+                // not change the cache hit rate.
+                let mut cache = RingBufferCache::new(1);
+
+                // Evaluate every prefix of the input, from one bit up to the full length.
+                for prefix_length in 1..=size {
+                    let prefix: IdpfInput = input[..prefix_length].to_owned().into();
+                    idpf.eval(0, &public_share, &keys[0], &prefix, &[0; 16], &mut cache)
+                        .unwrap();
+                }
+            });
+        });
+    }
+    eval_group.finish();
+}
+
+/// Benchmark the Poplar1 VDAF.
+///
+/// Registers the Criterion groups `poplar1_shard` and `poplar1_prepare_init`, each run for
+/// measurements of 16, 128 and 256 bits. Every benchmark seeds its own `StdRng` from `RNG_SEED`.
+#[cfg(feature = "experimental")]
+fn poplar1(c: &mut Criterion) {
+    let test_sizes = [16_usize, 128, 256];
+
+    let mut shard_group = c.benchmark_group("poplar1_shard");
+    for &size in &test_sizes {
+        shard_group.throughput(Throughput::Bytes(size as u64 / 8));
+        shard_group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, &size| {
+            let vdaf = Poplar1::new_shake128(size);
+            let mut rng = StdRng::seed_from_u64(RNG_SEED);
+            let nonce: [u8; 16] = rng.gen();
+
+            // Untimed setup: draw a fresh random `size`-bit measurement for each iteration.
+            let random_measurement = || {
+                let bits: Vec<bool> = iter::repeat_with(|| rng.gen()).take(size).collect();
+                IdpfInput::from_bools(&bits)
+            };
+
+            b.iter_batched(
+                random_measurement,
+                |measurement| {
+                    vdaf.shard(&measurement, &nonce).unwrap();
+                },
+                BatchSize::SmallInput,
+            );
+        });
+    }
+    shard_group.finish();
+
+    let mut prepare_group = c.benchmark_group("poplar1_prepare_init");
+    // This is a slower benchmark, so give every size 30 seconds of measurement time.
+    prepare_group.measurement_time(Duration::from_secs(30));
+    for &size in &test_sizes {
+        prepare_group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, &size| {
+            let vdaf = Poplar1::new_shake128(size);
+            let mut rng = StdRng::seed_from_u64(RNG_SEED);
+
+            // Untimed setup: build everything `prepare_init` needs for a single report.
+            let prepare_inputs = || {
+                let verify_key: [u8; 16] = rng.gen();
+                let nonce: [u8; 16] = rng.gen();
+
+                // The batch parameters follow Chris Wood's experimental setup:
+                // https://github.com/chris-wood/heavy-hitter-comparison
+                let (measurements, prefix_tree) = poplar1_generate_zipf_distributed_batch(
+                    &mut rng, // rng
+                    size,     // bits
+                    10,       // threshold
+                    1000,     // number of measurements
+                    128,      // Zipf support
+                    1.03,     // Zipf exponent
+                );
+
+                // Only one report is prepared per iteration, and which measurement it carries
+                // is irrelevant here, so the first one is used.
+                let (public_share, input_shares) = vdaf.shard(&measurements[0], &nonce).unwrap();
+
+                // The aggregation parameter is built from the candidate prefixes that the
+                // prefix tree holds for the sampled measurements. The last level is used
+                // because it ought to represent the worst-case performance.
+                let candidate_prefixes = prefix_tree[size - 1].clone();
+                let agg_param =
+                    Poplar1AggregationParam::try_from_prefixes(candidate_prefixes).unwrap();
+
+                // Only the first input share is prepared.
+                let input_share = input_shares.into_iter().next().unwrap();
+
+                (verify_key, nonce, agg_param, public_share, input_share)
+            };
+
+            b.iter_batched(
+                prepare_inputs,
+                |(verify_key, nonce, agg_param, public_share, input_share)| {
+                    vdaf.prepare_init(
+                        &verify_key,
+                        0,
+                        &agg_param,
+                        &nonce,
+                        &public_share,
+                        &input_share,
+                    )
+                    .unwrap();
+                },
+                BatchSize::SmallInput,
+            );
+        });
+    }
+    prepare_group.finish();
+}
+
+/// Sample a batch of Poplar1 measurements and derive the matching prefix tree.
+///
+/// First, `zipf_support` random inputs of `bits` bits are generated. Then `measurement_count`
+/// measurements are drawn from those inputs according to a Zipf distribution with parameters
+/// `zipf_support` and `zipf_exponent`. The measurements are returned together with the prefix
+/// tree for `threshold`.
+///
+/// The prefix tree holds one list of candidate prefixes per level. The first level always
+/// consists of the two one-bit prefixes. Each later level is derived from the one before it:
+/// every prefix `p` that is a prefix of at least `threshold` measurements contributes `p || 0`
+/// and `p || 1`, in that order.
+#[cfg(feature = "experimental")]
+fn poplar1_generate_zipf_distributed_batch(
+    rng: &mut impl Rng,
+    bits: usize,
+    threshold: usize,
+    measurement_count: usize,
+    zipf_support: usize,
+    zipf_exponent: f64,
+) -> (Vec<IdpfInput>, Vec<Vec<IdpfInput>>) {
+    // The support of the distribution: uniformly random `bits`-bit inputs.
+    let support: Vec<IdpfInput> = (0..zipf_support)
+        .map(|_| {
+            let bools: Vec<bool> = (0..bits).map(|_| rng.gen()).collect();
+            IdpfInput::from_bools(&bools)
+        })
+        .collect();
+
+    // Draw the measurements. One is subtracted from each Zipf sample to turn it into an index
+    // into `support`.
+    let zipf = ZipfDistribution::new(zipf_support, zipf_exponent).unwrap();
+    let samples: Vec<IdpfInput> = (0..measurement_count)
+        .map(|_| support[zipf.sample(&mut *rng) - 1].clone())
+        .collect();
+
+    // True if every bit of `prefix` matches the bit of `sample` at the same position.
+    let is_prefix_of = |prefix: &IdpfInput, sample: &IdpfInput| {
+        (0..prefix.len()).all(|j| prefix[j] == sample[j])
+    };
+
+    // Grow the prefix tree level by level, starting from the two one-bit prefixes.
+    let mut prefix_tree = Vec::with_capacity(bits);
+    prefix_tree.push(vec![
+        IdpfInput::from_bools(&[false]),
+        IdpfInput::from_bools(&[true]),
+    ]);
+
+    for level in 0..bits - 1 {
+        let next_prefixes: Vec<IdpfInput> = prefix_tree[level]
+            .iter()
+            .filter(|&prefix| {
+                // Keep a prefix only if enough measurements start with it.
+                let hit_count = samples
+                    .iter()
+                    .filter(|&sample| is_prefix_of(prefix, sample))
+                    .count();
+                hit_count >= threshold
+            })
+            .flat_map(|prefix| {
+                [
+                    prefix.clone_with_suffix(&[false]),
+                    prefix.clone_with_suffix(&[true]),
+                ]
+            })
+            .collect();
+        prefix_tree.push(next_prefixes);
+    }
+
+    (samples, prefix_tree)
+}
+
+#[cfg(all(feature = "prio2", feature = "experimental"))] // both optional features enabled
+criterion_group!(benches, poplar1, prio3, prio2, poly_mul, prng, idpf, dp_noise);
+#[cfg(all(not(feature = "prio2"), feature = "experimental"))] // "experimental" only
+criterion_group!(benches, poplar1, prio3, poly_mul, prng, idpf, dp_noise);
+#[cfg(all(feature = "prio2", not(feature = "experimental")))] // "prio2" only
+criterion_group!(benches, prio3, prio2, prng, poly_mul);
+#[cfg(all(not(feature = "prio2"), not(feature = "experimental")))] // neither optional feature
+criterion_group!(benches, prio3, prng, poly_mul);
+
+criterion_main!(benches); // the four cfg conditions above are mutually exclusive: one `benches`