|
| 1 | +//! Shared data generators, codec helpers, and pre-computed fixtures used by |
| 2 | +//! both the Criterion benchmark (`fastpfor_benchmark.rs`) and the smoke-test |
| 3 | +//! suite (`tests/benchmark_smoke.rs`). |
| 4 | +//! |
| 5 | +//! Loaded as a module via `#[path]` in both consumers, so every item consumed |
| 6 | +//! from outside must be `pub`. |
| 7 | +
|
| 8 | +// This is an internal dev-only module; doc-comments on every field would add |
| 9 | +// noise without benefit. |
| 10 | +#![allow(missing_docs)] |
| 11 | + |
| 12 | +use core::ops::Range; |
| 13 | +pub use std::io::Cursor; |
| 14 | +use std::num::NonZeroU32; |
| 15 | + |
| 16 | +pub use fastpfor::rust::{BLOCK_SIZE_128, BLOCK_SIZE_256, DEFAULT_PAGE_SIZE, FastPFOR, Integer}; |
| 17 | +use rand::rngs::StdRng; |
| 18 | +use rand::{RngExt as _, SeedableRng}; |
| 19 | + |
// Fixed RNG seed: every generator reseeds from this so fixtures are
// bit-identical across runs and between the benchmark and the smoke tests.
const SEED: u64 = 456;
| 21 | + |
| 22 | +// --------------------------------------------------------------------------- |
| 23 | +// Data generators (private — only used to build fixtures) |
| 24 | +// --------------------------------------------------------------------------- |
| 25 | + |
// Signature shared by every data generator: element count in, values out.
type DataGeneratorFn = fn(usize) -> Vec<u32>;
| 27 | + |
| 28 | +fn generate_uniform_data_from_range(size: usize, value_range: Range<u32>) -> Vec<u32> { |
| 29 | + let mut rng = StdRng::seed_from_u64(SEED); |
| 30 | + (0..size) |
| 31 | + .map(|_| rng.random_range(value_range.clone())) |
| 32 | + .collect() |
| 33 | +} |
| 34 | + |
| 35 | +pub fn generate_uniform_data_small_value_distribution(size: usize) -> Vec<u32> { |
| 36 | + generate_uniform_data_from_range(size, 0..1000) |
| 37 | +} |
| 38 | + |
| 39 | +fn generate_uniform_data_large_value_distribution(size: usize) -> Vec<u32> { |
| 40 | + generate_uniform_data_from_range(size, 0..u32::MAX) |
| 41 | +} |
| 42 | + |
| 43 | +fn generate_clustered_data(size: usize) -> Vec<u32> { |
| 44 | + let mut rng = StdRng::seed_from_u64(SEED); |
| 45 | + let mut base = 0u32; |
| 46 | + (0..size) |
| 47 | + .map(|_| { |
| 48 | + if rng.random_bool(0.1) { |
| 49 | + base = rng.random_range(0..1000); |
| 50 | + } |
| 51 | + base + rng.random_range(0..10) |
| 52 | + }) |
| 53 | + .collect() |
| 54 | +} |
| 55 | + |
/// Strictly increasing values `0, 1, 2, …` — the best case for delta-style codecs.
/// `size` is assumed to fit in `u32` (benchmark sizes are small).
fn generate_sequential_data(size: usize) -> Vec<u32> {
    let mut out = Vec::with_capacity(size);
    for i in 0..size {
        out.push(i as u32);
    }
    out
}
| 59 | + |
| 60 | +fn generate_sparse_data(size: usize) -> Vec<u32> { |
| 61 | + let mut rng = StdRng::seed_from_u64(SEED); |
| 62 | + (0..size) |
| 63 | + .map(|_| { |
| 64 | + if rng.random_bool(0.9) { |
| 65 | + 0 |
| 66 | + } else { |
| 67 | + rng.random() |
| 68 | + } |
| 69 | + }) |
| 70 | + .collect() |
| 71 | +} |
| 72 | + |
| 73 | +fn generate_constant_data(size: usize) -> Vec<u32> { |
| 74 | + vec![SEED as u32; size] |
| 75 | +} |
| 76 | + |
/// Powers of two cycling `1, 2, 4, …, 2^29`, then wrapping back to `1` —
/// exercises the full spread of bit widths.
fn generate_geometric_data(size: usize) -> Vec<u32> {
    let mut out = Vec::with_capacity(size);
    for i in 0..size {
        out.push(1u32 << (i % 30));
    }
    out
}
| 80 | + |
| 81 | +/// Patterns used by compression / decompression / roundtrip / block-size benchmarks. |
| 82 | +const COMPRESS_PATTERNS: &[(&str, DataGeneratorFn)] = &[ |
| 83 | + ( |
| 84 | + "uniform_small_value_distribution", |
| 85 | + generate_uniform_data_small_value_distribution, |
| 86 | + ), |
| 87 | + ( |
| 88 | + "uniform_large_value_distribution", |
| 89 | + generate_uniform_data_large_value_distribution, |
| 90 | + ), |
| 91 | + ("clustered", generate_clustered_data), |
| 92 | + ("sequential", generate_sequential_data), |
| 93 | + ("sparse", generate_sparse_data), |
| 94 | +]; |
| 95 | + |
| 96 | +/// Superset of `COMPRESS_PATTERNS`, also used by the compression-ratio benchmark. |
| 97 | +const ALL_PATTERNS: &[(&str, DataGeneratorFn)] = &[ |
| 98 | + ( |
| 99 | + "uniform_small_distribution", |
| 100 | + generate_uniform_data_small_value_distribution, |
| 101 | + ), |
| 102 | + ( |
| 103 | + "uniform_large_distribution", |
| 104 | + generate_uniform_data_large_value_distribution, |
| 105 | + ), |
| 106 | + ("clustered", generate_clustered_data), |
| 107 | + ("sequential", generate_sequential_data), |
| 108 | + ("sparse", generate_sparse_data), |
| 109 | + ("constant", generate_constant_data), |
| 110 | + ("geometric", generate_geometric_data), |
| 111 | +]; |
| 112 | + |
| 113 | +// --------------------------------------------------------------------------- |
| 114 | +// Codec helpers |
| 115 | +// --------------------------------------------------------------------------- |
| 116 | + |
| 117 | +/// Compress `data` and return the compressed words. |
| 118 | +pub fn compress_data(codec: &mut FastPFOR, data: &[u32]) -> Vec<u32> { |
| 119 | + let mut compressed = vec![0u32; data.len() * 2 + 1024]; |
| 120 | + let mut input_offset = Cursor::new(0); |
| 121 | + let mut output_offset = Cursor::new(0); |
| 122 | + codec |
| 123 | + .compress( |
| 124 | + data, |
| 125 | + data.len() as u32, |
| 126 | + &mut input_offset, |
| 127 | + &mut compressed, |
| 128 | + &mut output_offset, |
| 129 | + ) |
| 130 | + .unwrap(); |
| 131 | + let len = output_offset.position() as usize; |
| 132 | + compressed.truncate(len); |
| 133 | + compressed |
| 134 | +} |
| 135 | + |
| 136 | +/// Decompress `compressed` into the caller-provided `decompressed` buffer and |
| 137 | +/// return the number of elements written. |
| 138 | +/// |
| 139 | +/// The buffer must be allocated outside the timed loop so that allocation cost |
| 140 | +/// is not measured. |
| 141 | +pub fn decompress_data( |
| 142 | + codec: &mut FastPFOR, |
| 143 | + compressed: &[u32], |
| 144 | + decompressed: &mut [u32], |
| 145 | +) -> usize { |
| 146 | + let mut input_offset = Cursor::new(0); |
| 147 | + let mut output_offset = Cursor::new(0); |
| 148 | + codec |
| 149 | + .uncompress( |
| 150 | + compressed, |
| 151 | + compressed.len() as u32, |
| 152 | + &mut input_offset, |
| 153 | + decompressed, |
| 154 | + &mut output_offset, |
| 155 | + ) |
| 156 | + .unwrap(); |
| 157 | + output_offset.position() as usize |
| 158 | +} |
| 159 | + |
| 160 | +/// Pre-compress `data` with a specific `block_size` and return the compressed buffer. |
| 161 | +fn prepare_compressed_data(data: &[u32], block_size: NonZeroU32) -> Vec<u32> { |
| 162 | + compress_data(&mut FastPFOR::new(DEFAULT_PAGE_SIZE, block_size), data) |
| 163 | +} |
| 164 | + |
| 165 | +// --------------------------------------------------------------------------- |
| 166 | +// C++ helpers (compiled only when the `cpp` feature is active) |
| 167 | +// --------------------------------------------------------------------------- |
| 168 | + |
/// Encode `data` with the C++ codec and return only the words actually written.
#[cfg(feature = "cpp")]
pub fn cpp_encode(codec: &fastpfor::cpp::FastPFor128Codec, data: &[u32]) -> Vec<u32> {
    use fastpfor::cpp::Codec32 as _;
    // Same worst-case sizing heuristic as the Rust-side `compress_data`.
    let mut encoded = vec![0u32; data.len() * 2 + 1024];
    let written = codec.encode32(data, &mut encoded).unwrap().len();
    encoded.truncate(written);
    encoded
}
| 177 | + |
/// Decode `compressed` with the C++ codec into `decompressed`; returns the
/// number of elements written.
#[cfg(feature = "cpp")]
pub fn cpp_decode(
    codec: &fastpfor::cpp::FastPFor128Codec,
    compressed: &[u32],
    decompressed: &mut [u32],
) -> usize {
    use fastpfor::cpp::Codec32 as _;
    let decoded = codec.decode32(compressed, decompressed).unwrap();
    decoded.len()
}
| 187 | + |
| 188 | +// --------------------------------------------------------------------------- |
| 189 | +// Pre-computed fixtures |
| 190 | +// --------------------------------------------------------------------------- |
| 191 | + |
/// One row of pre-computed data for compression / decompression benchmarks.
pub struct CompressFixture {
    /// Benchmark-id label for this data pattern.
    pub name: &'static str,
    /// Uncompressed input, as produced by the pattern's generator.
    pub data: Vec<u32>,
    /// Rust-compressed form (`BLOCK_SIZE_128`), ready for decompression benchmarks.
    pub rust_compressed: Vec<u32>,
}
| 199 | + |
| 200 | +impl CompressFixture { |
| 201 | + fn new(name: &'static str, generator: DataGeneratorFn, size: usize) -> Self { |
| 202 | + let data = generator(size); |
| 203 | + let rust_compressed = prepare_compressed_data(&data, BLOCK_SIZE_128); |
| 204 | + Self { |
| 205 | + name, |
| 206 | + data, |
| 207 | + rust_compressed, |
| 208 | + } |
| 209 | + } |
| 210 | +} |
| 211 | + |
| 212 | +/// Build fixtures for every `COMPRESS_PATTERNS × sizes` combination. |
| 213 | +pub fn compress_fixtures(sizes: &[usize]) -> Vec<(usize, CompressFixture)> { |
| 214 | + sizes |
| 215 | + .iter() |
| 216 | + .flat_map(|&size| { |
| 217 | + COMPRESS_PATTERNS |
| 218 | + .iter() |
| 219 | + .map(move |&(name, generator)| (size, CompressFixture::new(name, generator, size))) |
| 220 | + }) |
| 221 | + .collect() |
| 222 | +} |
| 223 | + |
| 224 | +/// Build fixtures for every `ALL_PATTERNS` at a single size. |
| 225 | +pub fn ratio_fixtures(size: usize) -> Vec<CompressFixture> { |
| 226 | + ALL_PATTERNS |
| 227 | + .iter() |
| 228 | + .map(|&(name, generator)| CompressFixture::new(name, generator, size)) |
| 229 | + .collect() |
| 230 | +} |
| 231 | + |
/// One row for the block-size benchmark.
pub struct BlockSizeFixture {
    /// Codec block size this row was compressed with.
    pub block_size: NonZeroU32,
    /// Uncompressed input (uniform small-value distribution).
    pub data: Vec<u32>,
    /// `data` pre-compressed with `block_size`.
    pub compressed: Vec<u32>,
}
| 238 | + |
| 239 | +impl BlockSizeFixture { |
| 240 | + fn new(block_size: NonZeroU32, size: usize) -> Self { |
| 241 | + let data = generate_uniform_data_small_value_distribution(size); |
| 242 | + let compressed = prepare_compressed_data(&data, block_size); |
| 243 | + Self { |
| 244 | + block_size, |
| 245 | + data, |
| 246 | + compressed, |
| 247 | + } |
| 248 | + } |
| 249 | +} |
| 250 | + |
| 251 | +/// Build fixtures for both block sizes at a given `size`. |
| 252 | +pub fn block_size_fixtures(size: usize) -> Vec<BlockSizeFixture> { |
| 253 | + [BLOCK_SIZE_128, BLOCK_SIZE_256] |
| 254 | + .iter() |
| 255 | + .map(|&bs| BlockSizeFixture::new(bs, size)) |
| 256 | + .collect() |
| 257 | +} |
| 258 | + |
/// One row for the C++ vs Rust decode benchmark.
#[cfg(feature = "cpp")]
pub struct CppDecodeFixture {
    /// Benchmark-id label for this data pattern.
    pub name: &'static str,
    /// Compressed with the C++ `FastPFor128Codec`.
    pub cpp_compressed: Vec<u32>,
    /// Compressed with the Rust codec at `BLOCK_SIZE_128`.
    pub rust_compressed: Vec<u32>,
    /// Element count of the original input (sizes the decode output buffer).
    pub original_len: usize,
}
| 267 | + |
#[cfg(feature = "cpp")]
impl CppDecodeFixture {
    /// Compress the same generated input with both the C++ and Rust codecs.
    fn new(name: &'static str, generator: DataGeneratorFn, size: usize) -> Self {
        use fastpfor::cpp::FastPFor128Codec;
        let data = generator(size);
        // The two compressions are independent; order does not matter.
        let rust_compressed = prepare_compressed_data(&data, BLOCK_SIZE_128);
        let cpp_compressed = cpp_encode(&FastPFor128Codec::new(), &data);
        Self {
            name,
            cpp_compressed,
            rust_compressed,
            original_len: size,
        }
    }
}
| 284 | + |
/// Build C++ vs Rust decode fixtures for every `COMPRESS_PATTERNS × sizes` combination.
#[cfg(feature = "cpp")]
pub fn cpp_decode_fixtures(sizes: &[usize]) -> Vec<(usize, CppDecodeFixture)> {
    let mut fixtures = Vec::with_capacity(sizes.len() * COMPRESS_PATTERNS.len());
    for &size in sizes {
        for &(name, generator) in COMPRESS_PATTERNS {
            fixtures.push((size, CppDecodeFixture::new(name, generator, size)));
        }
    }
    fixtures
}
0 commit comments