From 8bb72c9da9e0086dc4992e70e9647da3752ecfff Mon Sep 17 00:00:00 2001 From: arc Date: Fri, 4 Apr 2025 16:24:47 -0600 Subject: [PATCH 01/16] refactor: move compress_fastest to a new file --- src/encoding/frame_compressor.rs | 56 +++++--------------------- src/encoding/levels/fastest.rs | 67 ++++++++++++++++++++++++++++++++ src/encoding/levels/mod.rs | 2 + src/encoding/mod.rs | 4 +- 4 files changed, 82 insertions(+), 47 deletions(-) create mode 100644 src/encoding/levels/fastest.rs create mode 100644 src/encoding/levels/mod.rs diff --git a/src/encoding/frame_compressor.rs b/src/encoding/frame_compressor.rs index fe7265ab4..727f917c3 100644 --- a/src/encoding/frame_compressor.rs +++ b/src/encoding/frame_compressor.rs @@ -9,14 +9,11 @@ use twox_hash::XxHash64; use core::hash::Hasher; use super::{ - block_header::BlockHeader, blocks::compress_block, frame_header::FrameHeader, + block_header::BlockHeader, frame_header::FrameHeader, levels::*, match_generator::MatchGeneratorDriver, CompressionLevel, Matcher, }; -use crate::{ - common::MAX_BLOCK_SIZE, - io::{Read, Write}, -}; +use crate::io::{Read, Write}; /// An interface for compressing arbitrary data with the ZStandard compression algorithm. /// @@ -106,13 +103,14 @@ impl FrameCompressor { /// To avoid endlessly encoding from a potentially endless source (like a network socket) you can use the /// [Read::take] function pub fn compress(&mut self) { + // Clearing buffers to allow re-using of the compressor self.state.matcher.reset(self.compression_level); self.state.last_huff_table = None; let source = self.uncompressed_data.as_mut().unwrap(); let drain = self.compressed_data.as_mut().unwrap(); - - let mut output = Vec::with_capacity(1024 * 130); - let output = &mut output; + // As the frame is compressed, it's stored here + let output: &mut Vec = &mut Vec::with_capacity(1024 * 130); + // First write the frame header let header = FrameHeader { frame_content_size: None, single_segment: false, @@ -120,10 +118,10 @@ impl FrameCompressor { dictionary_id: None, window_size: Some(self.state.matcher.window_size()), }; - header.serialize(output); - + // Now compress block by block loop { + // Read a single block's worth of uncompressed data from the input let mut uncompressed_data = self.state.matcher.get_next_space(); let mut read_bytes = 0; let last_block; @@ -140,6 +138,7 @@ impl FrameCompressor { } } uncompressed_data.resize(read_bytes, 0); + // As we read, hash that data too #[cfg(feature = "hash")] self.hasher.write(&uncompressed_data); // Special handling is needed for compression of a totally empty file (why you'd want to do that, I don't know) @@ -168,42 +167,7 @@ impl FrameCompressor { output.extend_from_slice(&uncompressed_data); } CompressionLevel::Fastest => { - if uncompressed_data.iter().all(|x| uncompressed_data[0].eq(x)) { - let rle_byte = uncompressed_data[0]; - self.state.matcher.commit_space(uncompressed_data); - self.state.matcher.skip_matching(); - let header = BlockHeader { - last_block, - block_type: crate::blocks::block::BlockType::RLE, - block_size: read_bytes.try_into().unwrap(), - }; - // Write the header, then the block - header.serialize(output); - output.push(rle_byte); - } else { - let mut compressed = Vec::new(); - self.state.matcher.commit_space(uncompressed_data); - compress_block(&mut self.state, &mut compressed); - if compressed.len() >= MAX_BLOCK_SIZE as usize { - let header = BlockHeader { - last_block, - block_type: crate::blocks::block::BlockType::Raw, - block_size: read_bytes.try_into().unwrap(), - }; - // Write the header, then the block - header.serialize(output); - output.extend_from_slice(self.state.matcher.get_last_space()); - } else { - let header = BlockHeader { - last_block, - block_type: crate::blocks::block::BlockType::Compressed, - block_size: (compressed.len()).try_into().unwrap(), - }; - // Write the header, then the block - header.serialize(output); - output.extend(compressed); - } - } + compress_fastest(&mut self.state, last_block, uncompressed_data, output) } _ => { unimplemented!(); diff --git a/src/encoding/levels/fastest.rs b/src/encoding/levels/fastest.rs new file mode 100644 index 000000000..4ec875727 --- /dev/null +++ b/src/encoding/levels/fastest.rs @@ -0,0 +1,67 @@ +use crate::{ + common::MAX_BLOCK_SIZE, + encoding::{ + block_header::BlockHeader, blocks::compress_block, frame_compressor::CompressState, Matcher, + }, +}; +use alloc::vec::Vec; + +/// Compresses a single block at [`crate::encoding::CompressionLevel::Fastest`]. +/// +/// # Parameters +/// - `state`: [`CompressState`] so the compressor can refer to data before +/// the start of this block +/// - `last_block`: Whether or not this block is going to be the last block in the frame +/// (needed because this info is written into the block header) +/// - `uncompressed_data`: A block's worth of uncompressed data, taken from the +/// larger input +/// - `output`: As `uncompressed_data` is compressed, it's appended to `output`. +#[inline] +pub fn compress_fastest( + state: &mut CompressState, + last_block: bool, + uncompressed_data: Vec, + output: &mut Vec, +) { + let block_size = uncompressed_data.len() as u32; + // First check to see if run length encoding can be used for the entire block + if uncompressed_data.iter().all(|x| uncompressed_data[0].eq(x)) { + let rle_byte = uncompressed_data[0]; + state.matcher.commit_space(uncompressed_data); + state.matcher.skip_matching(); + let header = BlockHeader { + last_block, + block_type: crate::blocks::block::BlockType::RLE, + block_size, + }; + // Write the header, then the block + header.serialize(output); + output.push(rle_byte); + } else { + // Compress as a standard compressed block + let mut compressed = Vec::new(); + state.matcher.commit_space(uncompressed_data); + compress_block(state, &mut compressed); + // If the compressed data is larger than the maximum + // allowable block size, instead store uncompressed + if compressed.len() >= MAX_BLOCK_SIZE as usize { + let header = BlockHeader { + last_block, + block_type: crate::blocks::block::BlockType::Raw, + block_size, + }; + // Write the header, then the block + header.serialize(output); + output.extend_from_slice(state.matcher.get_last_space()); + } else { + let header = BlockHeader { + last_block, + block_type: crate::blocks::block::BlockType::Compressed, + block_size: compressed.len() as u32, + }; + // Write the header, then the block + header.serialize(output); + output.extend(compressed); + } + } +} diff --git a/src/encoding/levels/mod.rs b/src/encoding/levels/mod.rs new file mode 100644 index 000000000..fb39caaf8 --- /dev/null +++ b/src/encoding/levels/mod.rs @@ -0,0 +1,2 @@ +mod fastest; +pub use fastest::compress_fastest; diff --git a/src/encoding/mod.rs b/src/encoding/mod.rs index 62b1fdd94..33c98a591 100644 --- a/src/encoding/mod.rs +++ b/src/encoding/mod.rs @@ -7,6 +7,7 @@ pub(crate) mod match_generator; pub(crate) mod util; mod frame_compressor; +mod levels; pub use frame_compressor::FrameCompressor; use crate::io::{Read, Write}; @@ -68,7 +69,8 @@ pub enum CompressionLevel { /// making their own tradeoffs between runtime, memory usage and compression ratio /// /// This trait operates on buffers that represent the chunks of data the matching algorithm wants to work on. -/// One or more of these buffers represent the window the decoder will need to decode the data again. +/// Each one of these buffers is referred to as a *space*. One or more of these buffers represent the window +/// the decoder will need to decode the data again. /// /// This library asks the Matcher for a new buffer using `get_next_space` to allow reusing of allocated buffers when they are no longer part of the /// window of data that is being used for matching. From 4cb43dbabb784c845bd26bd6c02e481af0cd3c19 Mon Sep 17 00:00:00 2001 From: arc Date: Fri, 18 Apr 2025 10:44:00 -0600 Subject: [PATCH 02/16] sync --- src/encoding/blocks/compressed.rs | 1 + src/encoding/levels/mod.rs | 2 ++ src/encoding/mod.rs | 2 +- src/lib.rs | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/encoding/blocks/compressed.rs b/src/encoding/blocks/compressed.rs index 7cf956c28..ad6e89a6d 100644 --- a/src/encoding/blocks/compressed.rs +++ b/src/encoding/blocks/compressed.rs @@ -8,6 +8,7 @@ use crate::{ huff0::huff0_encoder, }; +/// A block of [`crate::common::BlockType::Compressed`] pub fn compress_block(state: &mut CompressState, output: &mut Vec) { let mut literals_vec = Vec::new(); let mut sequences = Vec::new(); diff --git a/src/encoding/levels/mod.rs b/src/encoding/levels/mod.rs index fb39caaf8..ce6f66bd8 100644 --- a/src/encoding/levels/mod.rs +++ b/src/encoding/levels/mod.rs @@ -1,2 +1,4 @@ mod fastest; pub use fastest::compress_fastest; +mod default; +pub use default::compress_default; diff --git a/src/encoding/mod.rs b/src/encoding/mod.rs index 33c98a591..2d9797e9b 100644 --- a/src/encoding/mod.rs +++ b/src/encoding/mod.rs @@ -105,7 +105,7 @@ pub enum Sequence<'data> { /// Is encoded as a sequence for the decoder sequence execution. /// /// First the literals will be copied to the decoded data, - /// then `match_len` bytes are copied from `offset` bytes back in the buffer + /// then `match_len` bytes are copied from `offset` bytes back in the decoded data Triple { literals: &'data [u8], offset: usize, diff --git a/src/lib.rs b/src/lib.rs index 0d87f5ee3..6ca080bba 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -36,6 +36,7 @@ mod bit_io; mod common; pub mod decoding; pub mod encoding; +mod dictionary; pub(crate) mod blocks; From 0390c9aa3f08435e2b37ae1598446f5dcbf9efd3 Mon Sep 17 00:00:00 2001 From: arc Date: Mon, 21 Jul 2025 12:43:01 -0600 Subject: [PATCH 03/16] feat(dict): bare structure of dictionary creation --- Cargo.toml | 1 + src/dictionary/cover.rs | 127 +++++++++++++++++++++++++++++++ src/dictionary/frequency.rs | 71 ++++++++++++++++++ src/dictionary/mod.rs | 25 +++++++ src/dictionary/reservoir.rs | 133 +++++++++++++++++++++++++++++++++ src/encoding/levels/default.rs | 27 +++++++ src/lib.rs | 2 +- 7 files changed, 385 insertions(+), 1 deletion(-) create mode 100644 src/dictionary/cover.rs create mode 100644 src/dictionary/frequency.rs create mode 100644 src/dictionary/mod.rs create mode 100644 src/dictionary/reservoir.rs create mode 100644 src/encoding/levels/default.rs diff --git a/Cargo.toml b/Cargo.toml index e45c2b395..5d5207df7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"] compiler_builtins = { version = "0.1.2", optional = true } core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" } alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" } +fastrand = "2.3.0" [dev-dependencies] criterion = "0.5" diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs new file mode 100644 index 000000000..0179ce3f9 --- /dev/null +++ b/src/dictionary/cover.rs @@ -0,0 +1,127 @@ +//! An implementation of the dictionary generation algorithm +//! described in the paper "Effective Construction of Relative Lempel-Ziv Dictionaries", +//! by Liao, Petri, Moffat, and Wirth, published under the University of Melbourne. +//! +//! See: https://people.eng.unimelb.edu.au/ammoffat/abstracts/lpmw16www.pdf +//! +//! Facebook's implementation was also used as a reference. +//! https://github.com/facebook/zstd/tree/dev/lib/dictBuilder + +use std::collections::HashMap; +use std::vec::Vec; + +use crate::dictionary::frequency::compute_frequency; + +/// A set of values that are used during dictionary construction. +/// +/// Changing these values can improve the resulting dictionary size for certain datasets. +struct DictParams { + /// Segment size. + /// + /// As found under "4. Experiments - Varying Segment Size" in the original paper, a + /// segment size of 2 kiB was effective. + /// + /// "We explored a range of [segment_size] values and found the performance of LMC is insensitive + /// to [segment_size]. We fix [segment_size] to 2kiB + /// + /// Reasonable range: [16, 2048+] + segment_size: u32, + /// k-mer size + /// + ///As found under "4: Experiments - Varying k-mer Size" in the original paper, + /// "when k = 16, across all our text collections, there is a reasonable spread" + /// + /// Reasonable range: [6, 16] + /// + /// For now this value is ignored, and globally set to 16. + k: u32, +} + +struct Segment { + /// Relative to the beginning of the epoch, + /// the index of the start of the segment + starting_offset: u32, + /// A measure of how "ideal" a given segment would be to include in the dictionary. + score: u32, +} + +/// A re-usable allocation containing large allocations +/// that are used multiple times during dictionary construction (once per epoch) +struct Context { + /// Keeps track of the number of occurances of a particular k-mer + frequencies: HashMap<[u8; 2], usize>, + /// A collection of k-mers to be used in the final dictionary + pool: Vec<[u8; 2]>, +} + +impl Context { + fn new() -> Self { + Self { + frequencies: HashMap::new(), + pool: Vec::new(), + } + } +} + +/// Returns the highest scoring segment in an epoch +/// as a slice of that epoch. +fn pick_best_segment<'epoch>( + params: DictParams, + ctx: &mut Context, + epoch: &'epoch [u8], +) -> &'epoch [u8] { + let mut best_segment: &[u8] = &epoch[0..params.segment_size as usize]; + let mut top_segment_score = 0; + // Iterate over segments and score each segment, keeping track of the best segment + for segment in epoch.chunks(params.segment_size as usize) { + let segment_score = score_segment(ctx, epoch, segment); + if segment_score > top_segment_score { + best_segment = segment; + top_segment_score = segment_score; + } + } + + best_segment +} + +/// Given a segment, compute the score (or usefulness) of that segment against the entire epoch. +/// +/// `score_segment` modifies ctx.frequencies. +fn score_segment(ctx: &mut Context, epoch: &[u8], segment: &[u8]) -> usize { + let mut segment_score = 0; + // Determine the score of each overlapping k-mer + for i in 0..segment.len() - 1 { + let kmer = [segment[i], segment[i + 1]]; + // if the kmer is already in the pool, it recieves a score of zero + if !ctx.frequencies.contains_key(&kmer) { + continue; + } + let kmer_score = compute_frequency(kmer, epoch); + ctx.frequencies.insert(kmer, kmer_score); + segment_score += kmer_score; + } + + segment_score +} + +/// Computes the number of epochs and the size of each epoch. +/// +/// Returns a (number of epochs, epoch size) tuple. +/// +/// A translation of `COVER_epoch_info_t COVER_computeEpochs()` from facebook/zstd. +fn compute_epoch_info( + params: DictParams, + max_dict_size: usize, + num_kmers: usize, +) -> (usize, usize) { + let min_epoch_size = 10_000; // 10 KiB + let mut num_epochs: usize = usize::max(1, max_dict_size / params.segment_size as usize); + let mut epoch_size: usize = num_kmers / num_epochs; + if epoch_size >= min_epoch_size { + assert!(epoch_size * num_epochs <= num_kmers); + return (num_epochs, epoch_size); + } + epoch_size = usize::min(min_epoch_size, num_kmers); + num_epochs = num_kmers / epoch_size; + (num_epochs, epoch_size) +} diff --git a/src/dictionary/frequency.rs b/src/dictionary/frequency.rs new file mode 100644 index 000000000..5661af3dd --- /dev/null +++ b/src/dictionary/frequency.rs @@ -0,0 +1,71 @@ +//! Contains `compute_frequency`, a function +//! that uses a rolling Karp-Rabin hash to +//! efficiently count the number of occurences +//! of a given k-mer within a set. + +/// Computes a best effort guess as to how many times `pattern` occurs within +/// `body`. While not 100% accurate, it will be accurate the vast majority of time +pub(super) fn compute_frequency(pattern: [u8; 2], body: &[u8]) -> usize { + assert!(body.len() >= pattern.len()); + // A prime number for modulo operations to reduce collisions (q) + const PRIME: usize = 2654435761; + // Number of characters in the input alphabet (d) + const ALPHABET_SIZE: usize = 256; + // Hash of input pattern (p) + let mut input_hash: usize = 0; + // Hash of the current window of text (t) + let mut window_hash: usize = 0; + // High-order digit multiplier (h) + let mut h: usize = 1; + + // Precompute h (?) + h = (h * ALPHABET_SIZE) % PRIME; + + // Compute initial hash values + for i in 0..pattern.len() { + input_hash = (ALPHABET_SIZE * input_hash + pattern[i] as usize) % PRIME; + window_hash = (ALPHABET_SIZE * window_hash + body[i] as usize) % PRIME; + } + + let mut num_occurances = 0; + for i in 0..=body.len() - pattern.len() { + // There's *probably* a match if these two match + if input_hash == window_hash { + num_occurances += 1; + } + + // Compute hash values for next window + if i < body.len() - pattern.len() { + window_hash = (ALPHABET_SIZE * (window_hash - body[i] as usize * h) + + body[i + pattern.len()] as usize) + % PRIME; + } + } + + num_occurances +} + +#[cfg(test)] +mod tests { + use super::compute_frequency; + #[test] + fn dead_beef() { + assert_eq!( + compute_frequency([0xde, 0xad], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]), + 2 + ); + } + + #[test] + fn smallest_body() { + assert_eq!(compute_frequency([0x00, 0xff], &[0x00, 0xff]), 1); + } + + #[test] + fn no_match() { + assert_eq!( + compute_frequency([0xff, 0xff], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]), + 0 + ); + } +} diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs new file mode 100644 index 000000000..ba700660e --- /dev/null +++ b/src/dictionary/mod.rs @@ -0,0 +1,25 @@ +//! Code for creating a separate content dictionary. +//! +//! Implemented following the paper "Effective construction of +//! Relative Lempel-Ziv Dictionaries", by Kewen Liao, Matthias Petri, +//! Alistair Moffat, and Anthony Wirth + +// The algorithm is summarized here +// 1. The text is split into "epochs", or chunks from the original source +// 2. From within each epoch, we select the "segment", or 1 KiB contiguous section +// that's predicted to be the best option to include in the dictionary. Concatenated, +// these segments form the dictionary. +// +// This segment scoring algorithm operates as follows: +// For a given epoch: +// - Run a reservoir sampler over the entire epoch, creating a +// reservoir of n/t, where `t` is the desired number of occurances +// we want the most common k-mers to have +// - Have the ability to estimate +// the frequency of a given k-mer: f(w: k-mer) calculates +// the frequency of w in the reservoir using a rolling karp-rabin hash +// - The score of a segment is the sum of `f(w)` called on every kmer within the segment + +mod cover; +mod frequency; +mod reservoir; diff --git a/src/dictionary/reservoir.rs b/src/dictionary/reservoir.rs new file mode 100644 index 000000000..40cfcd4b8 --- /dev/null +++ b/src/dictionary/reservoir.rs @@ -0,0 +1,133 @@ +use crate::io; +use core::f64::consts::E; +use std::{dbg, io::ErrorKind}; +use fastrand; +use alloc::vec::Vec; + +/// A reservoir is created from an input stream. +/// +/// Once filled, it will contain a best effort sample of a dataset, +/// where each input value has an equivalent probability of being included. +struct Reservoir { + /// Where the sampled data is stored + lake: Vec, + /// K is the size of each sample. + /// + /// The original Zstd dictionary implementation states that values + /// between 16 and 2048+ are reasonable. + k: u16, +} + +impl Reservoir { + pub fn new(size: usize) -> Self { + assert!(size >= 16, "Reservoirs cannot be below 16 bytes in size"); + let mut lake = Vec::with_capacity(size); + lake.resize(size, 0); + let k: u16 = 16; + Self { + lake, + k + } + } + /// Filling the reservoir is performed using Algorithm L. + /// + /// The return value is the populated reservoir. + pub fn fill(mut self, source: &mut R) -> Result, io::Error> { + // https://en.wikipedia.org/wiki/Reservoir_sampling#:~:text=end%0A%20%20end%0Aend-,Optimal%3A%20Algorithm,-L%5Bedit + // https://richardstartin.github.io/posts/reservoir-sampling#algorithm-l:~:text=%3B%0A%20%20%20%20%7D%0A%7D-,Algorithm%20L,-Algorithm%20L%20was + // First fill the reservoir with the start of the input stream + let mut total_bytes_read: usize = 0; + while let Ok(num_bytes) = source.read(self.lake.as_mut_slice()) { + total_bytes_read += num_bytes; + // Stop when we've completely filled the buffer + if total_bytes_read == self.lake.len() { + break; + } + // If we haven't filled the lake all the way, resize it + if num_bytes == 0 { + self.lake.resize(total_bytes_read, 0); + } + } + + let mut threshold = E.powf(fastrand::f64().ln() / f64::from(self.k)); + // An index into the stream of the next sample to take + let mut next = self.lake.len(); + // Because we're sampling k-mers of size K into the lake, + // split the lake into chunks of k size for simplicity + let mut lake_chunks = self + .lake + .chunks_mut(self.k as usize) + .collect::>(); + + let end_of_lake = lake_chunks.len(); + let mut counter = end_of_lake / self.k as usize; + // Algorithm L is considered better than algorithm R because it + // determines how many inputs can be skipped, rather than + // processing every input. + + // This is done by abusing the statistics in ways + // I do not understand. + + // Items with a weight smaller than the threshold enter the lake, + // replacing the item in the lake with the largest threshold + let mut dumpster = Vec::with_capacity(self.k as usize); + loop { + let num_bytes_read; + if counter == next { + num_bytes_read = source + .read(lake_chunks[fastrand::usize(0..end_of_lake)]) + .unwrap(); + // Advance at least to the next sample, skipping forward a few samples + next += ((fastrand::f64().ln() / f64::ln(1.0 - threshold)).floor() as usize + 1) + * self.k as usize; + // Update the threshold to reflect changes + threshold *= E.powf(fastrand::f64().ln() / f64::from(end_of_lake as u32)) + } else { + // Drop the next chunk + num_bytes_read = source.read(&mut dumpster).unwrap(); + } + if num_bytes_read == 0 { + break; + } + counter += self.k as usize; + } + + Ok(self.lake) + } +} + +#[cfg(test)] +mod tests { + use super::Reservoir; + use alloc::vec; + + #[test] + fn initial_fill() { + // Create a reservoir 16 bytes in size and read + // 16 bytes into it + let r = Reservoir::new(16); + let test_data = vec![0_u8; 16]; + let output = r.fill(&mut test_data.as_slice()).unwrap(); + assert_eq!(test_data, output); + } + + #[test] + fn shrinks_for_small_sample() { + // Create a reservoir larger than the sample. + // The output should be smaller. + let r = Reservoir::new(32); + let test_data = vec![0_u8; 28]; + let output = r.fill(&mut test_data.as_slice()).unwrap(); + assert!(output.len() == 28); + } + + #[test] + fn lake_doesnt_grow() { + // Create a sample larger than the reservoir + // The output should be smaller. + let r = Reservoir::new(32); + let test_data = vec![0_u8; 16_000_000]; + let output = r.fill(&mut test_data.as_slice()).unwrap(); + assert!(output.len() == 32); + } +} \ No newline at end of file diff --git a/src/encoding/levels/default.rs b/src/encoding/levels/default.rs new file mode 100644 index 000000000..4b83bd246 --- /dev/null +++ b/src/encoding/levels/default.rs @@ -0,0 +1,27 @@ +use crate::{ + common::MAX_BLOCK_SIZE, + encoding::{ + block_header::BlockHeader, blocks::compress_block, frame_compressor::CompressState, Matcher, + }, +}; +use alloc::vec::Vec; + +/// Compresses a single block at [`crate::encoding::CompressionLevel::Default`]. +/// +/// # Parameters +/// - `state`: [`CompressState`] so the compressor can refer to data prior to +/// the start of this block +/// - `last_block`: Whether or not this block is going to be the last block in the frame +/// (needed because this info is written into the block header) +/// - `uncompressed_data`: A block's worth of uncompressed data, taken from the +/// larger input +/// - `output`: As `uncompressed_data` is compressed, it's appended to `output`. +#[inline] +pub fn compress_default( + state: &mut CompressState, + last_block: bool, + uncompressed_data: Vec, + output: &mut Vec, +) { + let block_size = uncompressed_data.len() as u32; +} diff --git a/src/lib.rs b/src/lib.rs index 6ca080bba..5d456bd12 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,8 +35,8 @@ macro_rules! vprintln { mod bit_io; mod common; pub mod decoding; -pub mod encoding; mod dictionary; +pub mod encoding; pub(crate) mod blocks; From 31974c1ea08d668bbaec0d667b0160cc3f3c81b5 Mon Sep 17 00:00:00 2001 From: arc Date: Fri, 25 Jul 2025 12:33:42 -0600 Subject: [PATCH 04/16] . --- src/dictionary/cover.rs | 52 +++++++++++++++++++++---------------- src/dictionary/frequency.rs | 2 +- src/dictionary/reader.rs | 2 ++ src/dictionary/reservoir.rs | 13 ++++------ 4 files changed, 38 insertions(+), 31 deletions(-) create mode 100644 src/dictionary/reader.rs diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs index 0179ce3f9..ad436ffa0 100644 --- a/src/dictionary/cover.rs +++ b/src/dictionary/cover.rs @@ -12,6 +12,14 @@ use std::vec::Vec; use crate::dictionary::frequency::compute_frequency; +/// The size of each k-mer +pub(super) const K: usize = 16; +///As found under "4: Experiments - Varying k-mer Size" in the original paper, +/// "when k = 16, across all our text collections, there is a reasonable spread" +/// +/// Reasonable range: [6, 16] +pub(super) type KMer = [u8; K]; + /// A set of values that are used during dictionary construction. /// /// Changing these values can improve the resulting dictionary size for certain datasets. @@ -26,32 +34,29 @@ struct DictParams { /// /// Reasonable range: [16, 2048+] segment_size: u32, - /// k-mer size - /// - ///As found under "4: Experiments - Varying k-mer Size" in the original paper, - /// "when k = 16, across all our text collections, there is a reasonable spread" - /// - /// Reasonable range: [6, 16] - /// - /// For now this value is ignored, and globally set to 16. - k: u32, } struct Segment { - /// Relative to the beginning of the epoch, - /// the index of the start of the segment - starting_offset: u32, - /// A measure of how "ideal" a given segment would be to include in the dictionary. - score: u32, + /// The actual contents of the segment. + raw: Vec, + /// A measure of how "ideal" a given segment would be to include in the dictionary + /// + /// Higher is better, there's no upper limit. This number is determined by + /// estimating the number of occurances in a given epoch + score: usize, } /// A re-usable allocation containing large allocations /// that are used multiple times during dictionary construction (once per epoch) struct Context { - /// Keeps track of the number of occurances of a particular k-mer - frequencies: HashMap<[u8; 2], usize>, - /// A collection of k-mers to be used in the final dictionary - pool: Vec<[u8; 2]>, + /// Keeps track of the number of occurances of a particular k-mer within an epoch. + /// + /// Reset for each epoch. + frequencies: HashMap, + /// A collection of segments to be used in the final dictionary. + /// + /// Contains the best segment from every epoch. + pool: Vec, } impl Context { @@ -69,9 +74,9 @@ fn pick_best_segment<'epoch>( params: DictParams, ctx: &mut Context, epoch: &'epoch [u8], -) -> &'epoch [u8] { +) -> Segment { let mut best_segment: &[u8] = &epoch[0..params.segment_size as usize]; - let mut top_segment_score = 0; + let mut top_segment_score: usize = 0; // Iterate over segments and score each segment, keeping track of the best segment for segment in epoch.chunks(params.segment_size as usize) { let segment_score = score_segment(ctx, epoch, segment); @@ -81,7 +86,10 @@ fn pick_best_segment<'epoch>( } } - best_segment + Segment { + raw: best_segment.into(), + score: top_segment_score, + } } /// Given a segment, compute the score (or usefulness) of that segment against the entire epoch. @@ -91,7 +99,7 @@ fn score_segment(ctx: &mut Context, epoch: &[u8], segment: &[u8]) -> usize { let mut segment_score = 0; // Determine the score of each overlapping k-mer for i in 0..segment.len() - 1 { - let kmer = [segment[i], segment[i + 1]]; + let kmer: &KMer = &(segment[i..i + K].try_into().expect("Failed to make kmer")); // if the kmer is already in the pool, it recieves a score of zero if !ctx.frequencies.contains_key(&kmer) { continue; diff --git a/src/dictionary/frequency.rs b/src/dictionary/frequency.rs index 5661af3dd..3c9d489e6 100644 --- a/src/dictionary/frequency.rs +++ b/src/dictionary/frequency.rs @@ -5,7 +5,7 @@ /// Computes a best effort guess as to how many times `pattern` occurs within /// `body`. While not 100% accurate, it will be accurate the vast majority of time -pub(super) fn compute_frequency(pattern: [u8; 2], body: &[u8]) -> usize { +pub(super) fn compute_frequency(pattern: KMer, body: &[u8]) -> usize { assert!(body.len() >= pattern.len()); // A prime number for modulo operations to reduce collisions (q) const PRIME: usize = 2654435761; diff --git a/src/dictionary/reader.rs b/src/dictionary/reader.rs new file mode 100644 index 000000000..d283ec099 --- /dev/null +++ b/src/dictionary/reader.rs @@ -0,0 +1,2 @@ +//! Provides an interface for reading from a large number of files without loading them all into +//! memory diff --git a/src/dictionary/reservoir.rs b/src/dictionary/reservoir.rs index 40cfcd4b8..77b2a9317 100644 --- a/src/dictionary/reservoir.rs +++ b/src/dictionary/reservoir.rs @@ -1,8 +1,8 @@ +use super::cover::K; use crate::io; +use alloc::vec::Vec; use core::f64::consts::E; -use std::{dbg, io::ErrorKind}; use fastrand; -use alloc::vec::Vec; /// A reservoir is created from an input stream. /// @@ -23,11 +23,8 @@ impl Reservoir { assert!(size >= 16, "Reservoirs cannot be below 16 bytes in size"); let mut lake = Vec::with_capacity(size); lake.resize(size, 0); - let k: u16 = 16; - Self { - lake, - k - } + let k = K as u16; + Self { lake, k } } /// Filling the reservoir is performed using Algorithm L. /// @@ -130,4 +127,4 @@ mod tests { let output = r.fill(&mut test_data.as_slice()).unwrap(); assert!(output.len() == 32); } -} \ No newline at end of file +} From e3e99e061d2ee7e92f7661671677b552a520e3bd Mon Sep 17 00:00:00 2001 From: arc Date: Tue, 29 Jul 2025 19:52:45 -0600 Subject: [PATCH 05/16] . --- src/dictionary/cover.rs | 21 +++++++++++++++------ src/dictionary/frequency.rs | 8 ++++---- src/dictionary/mod.rs | 8 +++++++- src/dictionary/reader.rs | 2 -- src/dictionary/reservoir.rs | 37 ++++++++++++++++++++++++++----------- 5 files changed, 52 insertions(+), 24 deletions(-) delete mode 100644 src/dictionary/reader.rs diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs index ad436ffa0..a859986da 100644 --- a/src/dictionary/cover.rs +++ b/src/dictionary/cover.rs @@ -7,11 +7,13 @@ //! Facebook's implementation was also used as a reference. //! https://github.com/facebook/zstd/tree/dev/lib/dictBuilder +use crate::dictionary::frequency::compute_frequency; +use crate::dictionary::reservoir::create_sample; +use core::convert::TryInto; use std::collections::HashMap; +use std::io::Cursor; use std::vec::Vec; -use crate::dictionary::frequency::compute_frequency; - /// The size of each k-mer pub(super) const K: usize = 16; ///As found under "4: Experiments - Varying k-mer Size" in the original paper, @@ -96,16 +98,23 @@ fn pick_best_segment<'epoch>( /// /// `score_segment` modifies ctx.frequencies. fn score_segment(ctx: &mut Context, epoch: &[u8], segment: &[u8]) -> usize { + // Create a reservoir sample of the entire epoch + // so we can estimate frequencies without checking the entire epoch + // TODO: epoch size / 10 was chosen randomly, find a better way to determine reservoir size + let epoch_sample = create_sample(&mut Cursor::new(epoch), epoch.len() / 10); + let mut segment_score = 0; // Determine the score of each overlapping k-mer for i in 0..segment.len() - 1 { - let kmer: &KMer = &(segment[i..i + K].try_into().expect("Failed to make kmer")); + let kmer: &KMer = (&segment[i..i + K]) + .try_into() + .expect("Failed to make kmer"); // if the kmer is already in the pool, it recieves a score of zero - if !ctx.frequencies.contains_key(&kmer) { + if !ctx.frequencies.contains_key(kmer) { continue; } - let kmer_score = compute_frequency(kmer, epoch); - ctx.frequencies.insert(kmer, kmer_score); + let kmer_score = compute_frequency(kmer, &epoch_sample); + ctx.frequencies.insert(*kmer, kmer_score); segment_score += kmer_score; } diff --git a/src/dictionary/frequency.rs b/src/dictionary/frequency.rs index 3c9d489e6..e3035f657 100644 --- a/src/dictionary/frequency.rs +++ b/src/dictionary/frequency.rs @@ -5,7 +5,7 @@ /// Computes a best effort guess as to how many times `pattern` occurs within /// `body`. While not 100% accurate, it will be accurate the vast majority of time -pub(super) fn compute_frequency(pattern: KMer, body: &[u8]) -> usize { +pub(super) fn compute_frequency(pattern: &[u8], body: &[u8]) -> usize { assert!(body.len() >= pattern.len()); // A prime number for modulo operations to reduce collisions (q) const PRIME: usize = 2654435761; @@ -51,20 +51,20 @@ mod tests { #[test] fn dead_beef() { assert_eq!( - compute_frequency([0xde, 0xad], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]), + compute_frequency(&[0xde, 0xad], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]), 2 ); } #[test] fn smallest_body() { - assert_eq!(compute_frequency([0x00, 0xff], &[0x00, 0xff]), 1); + assert_eq!(compute_frequency(&[0x00, 0xff], &[0x00, 0xff]), 1); } #[test] fn no_match() { assert_eq!( - compute_frequency([0xff, 0xff], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]), + compute_frequency(&[0xff, 0xff], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]), 0 ); } diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs index ba700660e..bb031d2e8 100644 --- a/src/dictionary/mod.rs +++ b/src/dictionary/mod.rs @@ -19,7 +19,13 @@ // the frequency of a given k-mer: f(w: k-mer) calculates // the frequency of w in the reservoir using a rolling karp-rabin hash // - The score of a segment is the sum of `f(w)` called on every kmer within the segment - mod cover; mod frequency; mod reservoir; + +use cover::*; +use std::io::{Read, Write}; + +/// Read from `source` to create a dictionary of `dict_size`. The completed dictionary is written +/// to `output`. +pub fn create_dict_from_reader(source: R, &mut output: W, dict_size: usize) {} diff --git a/src/dictionary/reader.rs b/src/dictionary/reader.rs deleted file mode 100644 index d283ec099..000000000 --- a/src/dictionary/reader.rs +++ /dev/null @@ -1,2 +0,0 @@ -//! Provides an interface for reading from a large number of files without loading them all into -//! memory diff --git a/src/dictionary/reservoir.rs b/src/dictionary/reservoir.rs index 77b2a9317..5249f87e0 100644 --- a/src/dictionary/reservoir.rs +++ b/src/dictionary/reservoir.rs @@ -1,15 +1,24 @@ use super::cover::K; -use crate::io; use alloc::vec::Vec; use core::f64::consts::E; use fastrand; +use std::io; + +/// Creates a representative sample of `input` of `size` bytes. +pub fn create_sample(input: &mut R, size: usize) -> Vec { + let reservoir = Reservoir::new(size); + reservoir.fill(input) +} /// A reservoir is created from an input stream. /// /// Once filled, it will contain a best effort sample of a dataset, /// where each input value has an equivalent probability of being included. struct Reservoir { - /// Where the sampled data is stored + /// Where the sampled data is stored. + /// + /// Once the lake is filled, then this should contain a representative sample + /// of the larger dataset. lake: Vec, /// K is the size of each sample. /// @@ -19,6 +28,7 @@ struct Reservoir { } impl Reservoir { + /// Initialize a new empty reservoir, creating an allocation of `size`. pub fn new(size: usize) -> Self { assert!(size >= 16, "Reservoirs cannot be below 16 bytes in size"); let mut lake = Vec::with_capacity(size); @@ -26,12 +36,14 @@ impl Reservoir { let k = K as u16; Self { lake, k } } + /// Filling the reservoir is performed using Algorithm L. /// /// The return value is the populated reservoir. - pub fn fill(mut self, source: &mut R) -> Result, io::Error> { + pub fn fill(mut self, source: &mut R) -> Vec { // https://en.wikipedia.org/wiki/Reservoir_sampling#:~:text=end%0A%20%20end%0Aend-,Optimal%3A%20Algorithm,-L%5Bedit // https://richardstartin.github.io/posts/reservoir-sampling#algorithm-l:~:text=%3B%0A%20%20%20%20%7D%0A%7D-,Algorithm%20L,-Algorithm%20L%20was + // First fill the reservoir with the start of the input stream let mut total_bytes_read: usize = 0; while let Ok(num_bytes) = source.read(self.lake.as_mut_slice()) { @@ -55,7 +67,7 @@ impl Reservoir { .lake .chunks_mut(self.k as usize) .collect::>(); - + // Used when discarding chunks let end_of_lake = lake_chunks.len(); let mut counter = end_of_lake / self.k as usize; // Algorithm L is considered better than algorithm R because it @@ -67,13 +79,15 @@ impl Reservoir { // Items with a weight smaller than the threshold enter the lake, // replacing the item in the lake with the largest threshold + let mut dumpster = Vec::with_capacity(self.k as usize); loop { - let num_bytes_read; + // `num_bytes_read` is kept track of to watch for EOD. + let num_bytes_read: u64; if counter == next { num_bytes_read = source .read(lake_chunks[fastrand::usize(0..end_of_lake)]) - .unwrap(); + .unwrap() as u64; // Advance at least to the next sample, skipping forward a few samples next += ((fastrand::f64().ln() / f64::ln(1.0 - threshold)).floor() as usize + 1) * self.k as usize; @@ -81,7 +95,8 @@ impl Reservoir { threshold *= E.powf(fastrand::f64().ln() / f64::from(end_of_lake as u32)) } else { // Drop the next chunk - num_bytes_read = source.read(&mut dumpster).unwrap(); + num_bytes_read = source.read(&mut dumpster).unwrap() as u64; + //source.seek_relative(self.k.into()).unwrap(); } if num_bytes_read == 0 { break; @@ -89,7 +104,7 @@ impl Reservoir { counter += self.k as usize; } - Ok(self.lake) + self.lake } } @@ -104,7 +119,7 @@ mod tests { // 16 bytes into it let r = Reservoir::new(16); let test_data = vec![0_u8; 16]; - let output = r.fill(&mut test_data.as_slice()).unwrap(); + let output = r.fill(&mut test_data.as_slice()); assert_eq!(test_data, output); } @@ -114,7 +129,7 @@ mod tests { // The output should be smaller. let r = Reservoir::new(32); let test_data = vec![0_u8; 28]; - let output = r.fill(&mut test_data.as_slice()).unwrap(); + let output = r.fill(&mut test_data.as_slice()); assert!(output.len() == 28); } @@ -124,7 +139,7 @@ mod tests { // The output should be smaller. let r = Reservoir::new(32); let test_data = vec![0_u8; 16_000_000]; - let output = r.fill(&mut test_data.as_slice()).unwrap(); + let output = r.fill(&mut test_data.as_slice()); assert!(output.len() == 32); } } From 7e0d4a303af1c11bd7a27c1d780c9468e130e399 Mon Sep 17 00:00:00 2001 From: arc Date: Thu, 31 Jul 2025 20:45:35 -0600 Subject: [PATCH 06/16] dict: more scaffolding for file processing --- src/dictionary/cover.rs | 34 +++++-------------------- src/dictionary/mod.rs | 51 +++++++++++++++++++++++++++++++++++-- src/dictionary/reservoir.rs | 2 +- 3 files changed, 57 insertions(+), 30 deletions(-) diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs index a859986da..f8e319cbb 100644 --- a/src/dictionary/cover.rs +++ b/src/dictionary/cover.rs @@ -7,11 +7,12 @@ //! Facebook's implementation was also used as a reference. //! https://github.com/facebook/zstd/tree/dev/lib/dictBuilder +use super::DictParams; use crate::dictionary::frequency::compute_frequency; use crate::dictionary::reservoir::create_sample; use core::convert::TryInto; use std::collections::HashMap; -use std::io::Cursor; +use std::io::{Cursor, Read}; use std::vec::Vec; /// The size of each k-mer @@ -22,23 +23,7 @@ pub(super) const K: usize = 16; /// Reasonable range: [6, 16] pub(super) type KMer = [u8; K]; -/// A set of values that are used during dictionary construction. -/// -/// Changing these values can improve the resulting dictionary size for certain datasets. -struct DictParams { - /// Segment size. - /// - /// As found under "4. Experiments - Varying Segment Size" in the original paper, a - /// segment size of 2 kiB was effective. - /// - /// "We explored a range of [segment_size] values and found the performance of LMC is insensitive - /// to [segment_size]. We fix [segment_size] to 2kiB - /// - /// Reasonable range: [16, 2048+] - segment_size: u32, -} - -struct Segment { +pub struct Segment { /// The actual contents of the segment. raw: Vec, /// A measure of how "ideal" a given segment would be to include in the dictionary @@ -72,7 +57,7 @@ impl Context { /// Returns the highest scoring segment in an epoch /// as a slice of that epoch. -fn pick_best_segment<'epoch>( +pub fn pick_best_segment<'epoch>( params: DictParams, ctx: &mut Context, epoch: &'epoch [u8], @@ -97,12 +82,7 @@ fn pick_best_segment<'epoch>( /// Given a segment, compute the score (or usefulness) of that segment against the entire epoch. /// /// `score_segment` modifies ctx.frequencies. -fn score_segment(ctx: &mut Context, epoch: &[u8], segment: &[u8]) -> usize { - // Create a reservoir sample of the entire epoch - // so we can estimate frequencies without checking the entire epoch - // TODO: epoch size / 10 was chosen randomly, find a better way to determine reservoir size - let epoch_sample = create_sample(&mut Cursor::new(epoch), epoch.len() / 10); - +fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> usize { let mut segment_score = 0; // Determine the score of each overlapping k-mer for i in 0..segment.len() - 1 { @@ -113,7 +93,7 @@ fn score_segment(ctx: &mut Context, epoch: &[u8], segment: &[u8]) -> usize { if !ctx.frequencies.contains_key(kmer) { continue; } - let kmer_score = compute_frequency(kmer, &epoch_sample); + let kmer_score = compute_frequency(kmer, &collection_sample); ctx.frequencies.insert(*kmer, kmer_score); segment_score += kmer_score; } @@ -126,7 +106,7 @@ fn score_segment(ctx: &mut Context, epoch: &[u8], segment: &[u8]) -> usize { /// Returns a (number of epochs, epoch size) tuple. /// /// A translation of `COVER_epoch_info_t COVER_computeEpochs()` from facebook/zstd. -fn compute_epoch_info( +pub fn compute_epoch_info( params: DictParams, max_dict_size: usize, num_kmers: usize, diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs index bb031d2e8..b16ca235c 100644 --- a/src/dictionary/mod.rs +++ b/src/dictionary/mod.rs @@ -1,9 +1,14 @@ //! Code for creating a separate content dictionary. //! +//! Effective dictionaries are up to 1% the size of the complete training body, +//! and are trained on many examples of the original data. +//! //! Implemented following the paper "Effective construction of //! Relative Lempel-Ziv Dictionaries", by Kewen Liao, Matthias Petri, //! Alistair Moffat, and Anthony Wirth +const GIBIBYTE: usize = 1 << 30; + // The algorithm is summarized here // 1. The text is split into "epochs", or chunks from the original source // 2. From within each epoch, we select the "segment", or 1 KiB contiguous section @@ -24,8 +29,50 @@ mod frequency; mod reservoir; use cover::*; -use std::io::{Read, Write}; +use std::io::{self, BufReader}; + +use crate::dictionary::reservoir::create_sample; + +/// A set of values that are used during dictionary construction. +/// +/// Changing these values can improve the resulting dictionary size for certain datasets. +pub struct DictParams { + /// Segment size. + /// + /// As found under "4. Experiments - Varying Segment Size" in the original paper, a + /// segment size of 2 kiB was effective. + /// + /// "We explored a range of [segment_size] values and found the performance of LMC is insensitive + /// to [segment_size]. We fix [segment_size] to 2kiB + /// + /// Reasonable range: [16, 2048+] + pub segment_size: u32, +} /// Read from `source` to create a dictionary of `dict_size`. The completed dictionary is written /// to `output`. -pub fn create_dict_from_reader(source: R, &mut output: W, dict_size: usize) {} +/// +/// - `source` will be used as training data for the entire dictionary. +/// - `source_size` influences how the data is divided and sampled and is measured +/// in bytes. While this does not need to be exact, estimates should attempt to be +/// larger than the actual collection size. +/// - `output` is where the completed dictionary will be written. +/// - `dict_size` determines how large the complete dictionary should be. The completed +/// dictionary will be this size or smaller. +/// +/// This function uses `BufRead` internally, the provided reader need not be buffered. +pub fn create_dict_from_source( + source: R, + source_size: usize, + output: &mut W, + dict_size: usize, +) { + let params = DictParams { segment_size: 2048 }; + let mut buffered_source = BufReader::new(source); + let sample_size = buffered_source; + let collection_sample = create_sample(&mut buffered_source, 2 * GIBIBYTE); + // According to 4. Experiments - Varying Reservoir Sampler Thresholds, + // setting reservoir size to collection size / min{collection size / 2 * number of segments, + // 256} was effective + let (epoch_size, num_epochs) = compute_epoch_info(params, dict_size, num_kmers); +} diff --git a/src/dictionary/reservoir.rs b/src/dictionary/reservoir.rs index 5249f87e0..041ebc498 100644 --- a/src/dictionary/reservoir.rs +++ b/src/dictionary/reservoir.rs @@ -103,7 +103,7 @@ impl Reservoir { } counter += self.k as usize; } - + self.lake.shrink_to_fit(); self.lake } } From be49b6dbba51544417ff7ac727f587cf93122f43 Mon Sep 17 00:00:00 2001 From: arc Date: Thu, 7 Aug 2025 21:08:23 -0600 Subject: [PATCH 07/16] dict: rudimentary implementation --- src/bin/zstd.rs | 81 ++++++++++++++++++++----------------- src/dictionary/cover.rs | 57 +++++++++++++++++--------- src/dictionary/frequency.rs | 25 +++++++----- src/dictionary/mod.rs | 57 +++++++++++++++++++++++--- src/lib.rs | 4 +- 5 files changed, 151 insertions(+), 73 deletions(-) diff --git a/src/bin/zstd.rs b/src/bin/zstd.rs index bdd80bd7e..07f616308 100644 --- a/src/bin/zstd.rs +++ b/src/bin/zstd.rs @@ -9,6 +9,7 @@ use std::time::Instant; use ruzstd::decoding::errors::FrameDecoderError; use ruzstd::decoding::errors::ReadFrameHeaderError; +use ruzstd::dictionary::create_dict_from_source; use ruzstd::encoding::CompressionLevel; use ruzstd::encoding::FrameCompressor; @@ -153,44 +154,48 @@ impl Read for PercentPrintReader { } fn main() { - let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect(); - let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect(); - file_paths.remove(0); - - if flags.is_empty() { - let mut encoder = FrameCompressor::new(CompressionLevel::Fastest); - encoder.set_drain(Vec::new()); - - for path in file_paths { - let start_instant = Instant::now(); - let file = std::fs::File::open(&path).unwrap(); - let input_len = file.metadata().unwrap().len() as usize; - let file = PercentPrintReader { - reader: BufReader::new(file), - total: input_len, - counter: 0, - last_percent: 0, - }; - encoder.set_source(file); - encoder.compress(); - let mut output: Vec<_> = encoder.take_drain().unwrap(); - println!( - "Compressed {path:} from {} to {} ({}%) took {}ms", - input_len, - output.len(), - if input_len == 0 { - 0 - } else { - output.len() * 100 / input_len - }, - start_instant.elapsed().as_millis() - ); - output.clear(); - encoder.set_drain(output); - } - } else { - decompress(&flags, &file_paths); - } + let input = File::open("local_corpus_files/enwik9").expect("open input file"); + let input_len = input.metadata().unwrap().len() as usize; + let mut output = File::create("output.dict").expect("create output file"); + create_dict_from_source(input, input_len, &mut output, 5_000_000); + //let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect(); + //let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect(); + //file_paths.remove(0); + // + //if flags.is_empty() { + // let mut encoder = FrameCompressor::new(CompressionLevel::Fastest); + // encoder.set_drain(Vec::new()); + // + // for path in file_paths { + // let start_instant = Instant::now(); + // let file = std::fs::File::open(&path).unwrap(); + // let input_len = file.metadata().unwrap().len() as usize; + // let file = PercentPrintReader { + // reader: BufReader::new(file), + // total: input_len, + // counter: 0, + // last_percent: 0, + // }; + // encoder.set_source(file); + // encoder.compress(); + // let mut output: Vec<_> = encoder.take_drain().unwrap(); + // println!( + // "Compressed {path:} from {} to {} ({}%) took {}ms", + // input_len, + // output.len(), + // if input_len == 0 { + // 0 + // } else { + // output.len() * 100 / input_len + // }, + // start_instant.elapsed().as_millis() + // ); + // output.clear(); + // encoder.set_drain(output); + // } + //} else { + // decompress(&flags, &file_paths); + //} } fn do_something(data: &[u8], s: &mut StateTracker) { diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs index f8e319cbb..6cb4d0392 100644 --- a/src/dictionary/cover.rs +++ b/src/dictionary/cover.rs @@ -11,7 +11,7 @@ use super::DictParams; use crate::dictionary::frequency::compute_frequency; use crate::dictionary::reservoir::create_sample; use core::convert::TryInto; -use std::collections::HashMap; +use std::collections::{BinaryHeap, HashMap}; use std::io::{Cursor, Read}; use std::vec::Vec; @@ -25,32 +25,52 @@ pub(super) type KMer = [u8; K]; pub struct Segment { /// The actual contents of the segment. - raw: Vec, + pub raw: Vec, /// A measure of how "ideal" a given segment would be to include in the dictionary /// /// Higher is better, there's no upper limit. This number is determined by /// estimating the number of occurances in a given epoch - score: usize, + pub score: usize, +} + +impl Eq for Segment {} + +impl PartialEq for Segment { + fn eq(&self, other: &Self) -> bool { + // We only really care about score in regards to heap order + self.score == other.score + } +} + +impl PartialOrd for Segment { + fn partial_cmp(&self, other: &Self) -> Option { + match self.score.partial_cmp(&other.score) { + Some(core::cmp::Ordering::Equal) => {} + ord => return ord, + } + self.score.partial_cmp(&other.score) + } +} + +impl Ord for Segment { + fn cmp(&self, other: &Self) -> core::cmp::Ordering { + self.score.cmp(&other.score) + } } /// A re-usable allocation containing large allocations /// that are used multiple times during dictionary construction (once per epoch) -struct Context { +pub struct Context { /// Keeps track of the number of occurances of a particular k-mer within an epoch. /// /// Reset for each epoch. - frequencies: HashMap, - /// A collection of segments to be used in the final dictionary. - /// - /// Contains the best segment from every epoch. - pool: Vec, + pub frequencies: HashMap, } impl Context { fn new() -> Self { Self { frequencies: HashMap::new(), - pool: Vec::new(), } } } @@ -58,15 +78,16 @@ impl Context { /// Returns the highest scoring segment in an epoch /// as a slice of that epoch. pub fn pick_best_segment<'epoch>( - params: DictParams, + params: &DictParams, ctx: &mut Context, - epoch: &'epoch [u8], + collection_sample: &'epoch [u8], ) -> Segment { - let mut best_segment: &[u8] = &epoch[0..params.segment_size as usize]; + vprintln!("\tpick_best: picking best segment in epoch"); + let mut best_segment: &[u8] = &collection_sample[0..params.segment_size as usize]; let mut top_segment_score: usize = 0; // Iterate over segments and score each segment, keeping track of the best segment - for segment in epoch.chunks(params.segment_size as usize) { - let segment_score = score_segment(ctx, epoch, segment); + for segment in collection_sample.chunks(params.segment_size as usize) { + let segment_score = score_segment(ctx, collection_sample, segment); if segment_score > top_segment_score { best_segment = segment; top_segment_score = segment_score; @@ -85,12 +106,12 @@ pub fn pick_best_segment<'epoch>( fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> usize { let mut segment_score = 0; // Determine the score of each overlapping k-mer - for i in 0..segment.len() - 1 { + for i in 0..segment.len() - K - 1 { let kmer: &KMer = (&segment[i..i + K]) .try_into() .expect("Failed to make kmer"); // if the kmer is already in the pool, it recieves a score of zero - if !ctx.frequencies.contains_key(kmer) { + if ctx.frequencies.contains_key(kmer) { continue; } let kmer_score = compute_frequency(kmer, &collection_sample); @@ -107,7 +128,7 @@ fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> /// /// A translation of `COVER_epoch_info_t COVER_computeEpochs()` from facebook/zstd. pub fn compute_epoch_info( - params: DictParams, + params: &DictParams, max_dict_size: usize, num_kmers: usize, ) -> (usize, usize) { diff --git a/src/dictionary/frequency.rs b/src/dictionary/frequency.rs index e3035f657..ba3ca14d9 100644 --- a/src/dictionary/frequency.rs +++ b/src/dictionary/frequency.rs @@ -6,38 +6,45 @@ /// Computes a best effort guess as to how many times `pattern` occurs within /// `body`. While not 100% accurate, it will be accurate the vast majority of time pub(super) fn compute_frequency(pattern: &[u8], body: &[u8]) -> usize { + //vprintln!( + // "\tkarp-rabin: searching haystack of size {} for needle of size {} with ident {}", + // pattern.len(), + // body.len(), + // pattern[0] + pattern[1] + //); assert!(body.len() >= pattern.len()); // A prime number for modulo operations to reduce collisions (q) - const PRIME: usize = 2654435761; + const PRIME: isize = 2654435761; // Number of characters in the input alphabet (d) - const ALPHABET_SIZE: usize = 256; + const ALPHABET_SIZE: isize = 256; // Hash of input pattern (p) - let mut input_hash: usize = 0; + let mut input_hash: isize = 0; // Hash of the current window of text (t) - let mut window_hash: usize = 0; + let mut window_hash: isize = 0; // High-order digit multiplier (h) - let mut h: usize = 1; + let mut h: isize = 1; // Precompute h (?) h = (h * ALPHABET_SIZE) % PRIME; // Compute initial hash values for i in 0..pattern.len() { - input_hash = (ALPHABET_SIZE * input_hash + pattern[i] as usize) % PRIME; - window_hash = (ALPHABET_SIZE * window_hash + body[i] as usize) % PRIME; + input_hash = (ALPHABET_SIZE * input_hash + pattern[i] as isize) % PRIME; + window_hash = (ALPHABET_SIZE * window_hash + body[i] as isize) % PRIME; } let mut num_occurances = 0; for i in 0..=body.len() - pattern.len() { // There's *probably* a match if these two match if input_hash == window_hash { + vprintln!("\t\tkarp-rabin: found occurance in sample"); num_occurances += 1; } // Compute hash values for next window if i < body.len() - pattern.len() { - window_hash = (ALPHABET_SIZE * (window_hash - body[i] as usize * h) - + body[i + pattern.len()] as usize) + window_hash = (ALPHABET_SIZE * (window_hash - body[i] as isize * h) + + body[i + pattern.len()] as isize) % PRIME; } } diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs index b16ca235c..79273b95b 100644 --- a/src/dictionary/mod.rs +++ b/src/dictionary/mod.rs @@ -28,8 +28,13 @@ mod cover; mod frequency; mod reservoir; +use core::cmp::Reverse; use cover::*; -use std::io::{self, BufReader}; +use std::{ + collections::{BinaryHeap, HashMap}, + io::{self, BufReader, Read}, + vec, +}; use crate::dictionary::reservoir::create_sample; @@ -67,12 +72,52 @@ pub fn create_dict_from_source( output: &mut W, dict_size: usize, ) { + vprintln!("create_dict: creating {dict_size} byte dict from {source_size} byte source"); + let mut buffered_source = BufReader::with_capacity(5_000_000, source); + let params = DictParams { segment_size: 2048 }; - let mut buffered_source = BufReader::new(source); - let sample_size = buffered_source; - let collection_sample = create_sample(&mut buffered_source, 2 * GIBIBYTE); + let num_segments = source_size / params.segment_size as usize; // According to 4. Experiments - Varying Reservoir Sampler Thresholds, - // setting reservoir size to collection size / min{collection size / 2 * number of segments, + // setting reservoir size to collection size / min{collection size / (2 * number of segments), // 256} was effective - let (epoch_size, num_epochs) = compute_epoch_info(params, dict_size, num_kmers); + let sample_size = source_size / usize::min(source_size / (2 * num_segments), 256) / 1000; + vprintln!("create_dict: creating {sample_size} byte sample of collection"); + let collection_sample = create_sample(&mut buffered_source, sample_size); + + // A collection of segments to be used in the final dictionary. + // + // Contains the best segment from every epoch. + // Reverse is used because we want a min heap, where + // the lowest scoring items come first + let mut pool: BinaryHeap> = BinaryHeap::new(); + let (num_epochs, epoch_size) = compute_epoch_info(¶ms, dict_size, source_size / K); + vprintln!("create_dict: computed epoch info, using {num_epochs} epochs of {epoch_size} bytes"); + let mut current_epoch = vec![0; epoch_size]; + let mut epoch_counter = 0; + let mut ctx = Context { + frequencies: HashMap::with_capacity(epoch_size / K), + }; + // Score each segment in the epoch and select the highest scoring segment + // for the pool + while buffered_source + .read(&mut current_epoch) + .expect("can read input") + != 0 + { + epoch_counter += 1; + let best_segment = pick_best_segment(¶ms, &mut ctx, &collection_sample); + vprintln!( + "\tcreate_dict: epoch {epoch_counter}/{num_epochs} has best segment score {}", + best_segment.score + ); + pool.push(Reverse(best_segment)); + // Wipe frequency list for next epoch + ctx.frequencies.clear(); + } + vprintln!("create_dict: writing {} segments", pool.len()); + // Write the dictionary with the highest scoring segment last because + // closer items can be represented with a smaller offset + while let Some(segment) = pool.pop() { + output.write(&segment.0.raw).expect("can write to output"); + } } diff --git a/src/lib.rs b/src/lib.rs index 5d456bd12..49366d80d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,7 +21,7 @@ extern crate std; extern crate alloc; #[cfg(feature = "std")] -pub(crate) const VERBOSE: bool = false; +pub(crate) const VERBOSE: bool = true; macro_rules! vprintln { ($($x:expr),*) => { @@ -35,7 +35,7 @@ macro_rules! vprintln { mod bit_io; mod common; pub mod decoding; -mod dictionary; +pub mod dictionary; pub mod encoding; pub(crate) mod blocks; From 09e52d07340acdb2e13817b066e8be6e424f7258 Mon Sep 17 00:00:00 2001 From: arc Date: Sun, 10 Aug 2025 18:35:24 -0600 Subject: [PATCH 08/16] sync --- Cargo.toml | 10 ++- src/bin/zstd.rs | 2 +- src/bin/zstd_dict.rs | 133 ++++++++++++++++++++++++++++++++++++ src/dictionary/cover.rs | 14 ++-- src/dictionary/frequency.rs | 17 +++-- src/dictionary/mod.rs | 67 +++++++++++++++--- 6 files changed, 217 insertions(+), 26 deletions(-) create mode 100644 src/bin/zstd_dict.rs diff --git a/Cargo.toml b/Cargo.toml index 5d5207df7..ebd6dbdf7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,11 +21,12 @@ compiler_builtins = { version = "0.1.2", optional = true } core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" } alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" } fastrand = "2.3.0" +zstd = { version = "0.13.2", features = ["zstdmt"]} + [dev-dependencies] criterion = "0.5" rand = { version = "0.8.5", features = ["small_rng"] } -zstd = "0.13.2" [features] default = ["hash", "std"] @@ -48,3 +49,10 @@ required-features = ["std"] [[bin]] name = "zstd_stream" required-features = ["std"] + +[[bin]] +name = "zstd_dict" +required-features = ["std"] +dependancies = [ + "zstd" +] diff --git a/src/bin/zstd.rs b/src/bin/zstd.rs index 07f616308..3ca7344f1 100644 --- a/src/bin/zstd.rs +++ b/src/bin/zstd.rs @@ -157,7 +157,7 @@ fn main() { let input = File::open("local_corpus_files/enwik9").expect("open input file"); let input_len = input.metadata().unwrap().len() as usize; let mut output = File::create("output.dict").expect("create output file"); - create_dict_from_source(input, input_len, &mut output, 5_000_000); + create_dict_from_source(input, input_len, &mut output, 1_000_000); //let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect(); //let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect(); //file_paths.remove(0); diff --git a/src/bin/zstd_dict.rs b/src/bin/zstd_dict.rs new file mode 100644 index 000000000..87041f76b --- /dev/null +++ b/src/bin/zstd_dict.rs @@ -0,0 +1,133 @@ +use ruzstd::dictionary::{create_dict_from_dir, create_dict_from_source}; +use std::fmt::Display; +use std::fs::File; +use std::io::{self, Cursor, Read, Write}; +use std::path::{Path, PathBuf}; +use std::{env::args, fs}; + +fn main() { + //let args: Vec = args().collect(); + //let input_path: &Path = args.get(1).expect("no input provided").as_ref(); + //let output_path: &Path = args.get(2).expect("no output path provided").as_ref(); + //let dict_size = args + // .get(3) + // .expect("no dict size provided (kb)") + // .parse::() + // .expect("dict size was not a valid num"); + // + //let mut output = File::create(output_path).unwrap(); + //if input_path.is_file() { + // let source = File::open(input_path).expect("unable to open input path"); + // let source_size = source.metadata().unwrap().len(); + // create_dict_from_source(source, source_size as usize, &mut output, dict_size); + //} else { + // create_dict_from_dir(input_path, &mut output, dict_size).unwrap(); + //} + print!("{}", bench("local_corpus_files/github/")); +} + +struct BenchmarkResults { + uncompressed_size: usize, + nodict_size: usize, + reference_size: usize, + our_size: usize, +} + +impl Display for BenchmarkResults { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "uncompressed: 100.00% ({})", self.uncompressed_size)?; + writeln!( + f, + "no dict: {:.2} ({})", + f64::from(self.nodict_size as u32) / f64::from(self.uncompressed_size as u32), + self.nodict_size + )?; + writeln!( + f, + "reference dict: {:.2} ({})", + f64::from(self.reference_size as u32) / f64::from(self.uncompressed_size as u32), + self.reference_size + )?; + write!( + f, + "our dict: {:.2} ({})", + f64::from(self.our_size as u32) / f64::from(self.uncompressed_size as u32), + self.our_size + )?; + Ok(()) + } +} + +fn bench>(input_path: P) -> BenchmarkResults { + // At what compression level the dicts are built with + let compression_level = 22; + + // 1. Collect a list of a path to every file in the directory into `file_paths` + println!("[bench]: collecting list of input files"); + let mut file_paths: Vec = Vec::new(); + let dir: fs::ReadDir = fs::read_dir(&input_path).expect("read input path"); + fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec) -> Result<(), io::Error> { + for entry in dir { + let entry = entry?; + if entry.file_type()?.is_dir() { + recurse_read(fs::read_dir(&entry.path())?, file_paths)?; + } else { + file_paths.push(entry.path()); + } + } + Ok(()) + } + recurse_read(dir, &mut file_paths).expect("recursing over input dir"); + + // 2. Create two dictionaries, one with our strategy, and one with theirs + println!("[bench]: creating reference dict"); + let reference_dict = + zstd::dict::from_files(file_paths.iter(), 112640).expect("create reference dict"); + let mut our_dict = Vec::with_capacity(112640); + println!("[bench]: creating our dict"); + create_dict_from_dir(input_path, &mut our_dict, 112640).expect("create our dict"); + // Open each file and compress it + let mut uncompressed_size: usize = 0; + let mut nodict_size: usize = 0; + let mut reference_size: usize = 0; + let mut our_size: usize = 0; + + let mut reference_output: Vec = Vec::with_capacity(128_000); + let mut reference_encoder = + zstd::Encoder::with_dictionary(&mut reference_output, compression_level, &reference_dict) + .unwrap(); + reference_encoder.multithread(8).unwrap(); + let mut our_output: Vec = Vec::with_capacity(128_000); + let mut our_encoder = + zstd::Encoder::with_dictionary(&mut our_output, compression_level, &our_dict).unwrap(); + our_encoder.multithread(8).unwrap(); + for (idx, path) in file_paths.iter().enumerate() { + println!("[bench]: compressing file {}/{}", idx + 1, file_paths.len()); + let mut handle = File::open(path).unwrap(); + let mut data = Vec::new(); + handle.read_to_end(&mut data); + uncompressed_size += data.len(); + // Compress with no dict + let nodict_output = zstd::encode_all(data.as_slice(), compression_level).unwrap(); + nodict_size += nodict_output.len(); + // Compress with the reference dict + reference_encoder.write_all(data.as_slice()); + reference_encoder + .do_finish() + .expect("reference encoder finishes"); + reference_size += reference_output.len(); + reference_output.clear(); + // Compress with our dict + our_encoder.write_all(data.as_slice()); + our_encoder.finish().expect("our encoder finishes"); + our_size += our_output.len(); + our_output.clear(); + } + + BenchmarkResults { + uncompressed_size, + nodict_size, + reference_size, + our_size, + } +} diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs index 6cb4d0392..407cc9471 100644 --- a/src/dictionary/cover.rs +++ b/src/dictionary/cover.rs @@ -8,7 +8,7 @@ //! https://github.com/facebook/zstd/tree/dev/lib/dictBuilder use super::DictParams; -use crate::dictionary::frequency::compute_frequency; +use crate::dictionary::frequency::estimate_frequency; use crate::dictionary::reservoir::create_sample; use core::convert::TryInto; use std::collections::{BinaryHeap, HashMap}; @@ -82,11 +82,13 @@ pub fn pick_best_segment<'epoch>( ctx: &mut Context, collection_sample: &'epoch [u8], ) -> Segment { - vprintln!("\tpick_best: picking best segment in epoch"); - let mut best_segment: &[u8] = &collection_sample[0..params.segment_size as usize]; + let mut segments = collection_sample + .chunks(params.segment_size as usize) + .peekable(); + let mut best_segment: &[u8] = &segments.peek().expect("at least one segment"); let mut top_segment_score: usize = 0; // Iterate over segments and score each segment, keeping track of the best segment - for segment in collection_sample.chunks(params.segment_size as usize) { + for segment in segments { let segment_score = score_segment(ctx, collection_sample, segment); if segment_score > top_segment_score { best_segment = segment; @@ -106,7 +108,7 @@ pub fn pick_best_segment<'epoch>( fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> usize { let mut segment_score = 0; // Determine the score of each overlapping k-mer - for i in 0..segment.len() - K - 1 { + for i in 0..(segment.len() - K - 1) { let kmer: &KMer = (&segment[i..i + K]) .try_into() .expect("Failed to make kmer"); @@ -114,7 +116,7 @@ fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> if ctx.frequencies.contains_key(kmer) { continue; } - let kmer_score = compute_frequency(kmer, &collection_sample); + let kmer_score = estimate_frequency(kmer, &collection_sample); ctx.frequencies.insert(*kmer, kmer_score); segment_score += kmer_score; } diff --git a/src/dictionary/frequency.rs b/src/dictionary/frequency.rs index ba3ca14d9..72aa03531 100644 --- a/src/dictionary/frequency.rs +++ b/src/dictionary/frequency.rs @@ -5,7 +5,7 @@ /// Computes a best effort guess as to how many times `pattern` occurs within /// `body`. While not 100% accurate, it will be accurate the vast majority of time -pub(super) fn compute_frequency(pattern: &[u8], body: &[u8]) -> usize { +pub(super) fn estimate_frequency(pattern: &[u8], body: &[u8]) -> usize { //vprintln!( // "\tkarp-rabin: searching haystack of size {} for needle of size {} with ident {}", // pattern.len(), @@ -18,7 +18,7 @@ pub(super) fn compute_frequency(pattern: &[u8], body: &[u8]) -> usize { // Number of characters in the input alphabet (d) const ALPHABET_SIZE: isize = 256; // Hash of input pattern (p) - let mut input_hash: isize = 0; + let mut pattern_hash: isize = 0; // Hash of the current window of text (t) let mut window_hash: isize = 0; // High-order digit multiplier (h) @@ -29,15 +29,14 @@ pub(super) fn compute_frequency(pattern: &[u8], body: &[u8]) -> usize { // Compute initial hash values for i in 0..pattern.len() { - input_hash = (ALPHABET_SIZE * input_hash + pattern[i] as isize) % PRIME; + pattern_hash = (ALPHABET_SIZE * pattern_hash + pattern[i] as isize) % PRIME; window_hash = (ALPHABET_SIZE * window_hash + body[i] as isize) % PRIME; } let mut num_occurances = 0; for i in 0..=body.len() - pattern.len() { // There's *probably* a match if these two match - if input_hash == window_hash { - vprintln!("\t\tkarp-rabin: found occurance in sample"); + if pattern_hash == window_hash { num_occurances += 1; } @@ -54,24 +53,24 @@ pub(super) fn compute_frequency(pattern: &[u8], body: &[u8]) -> usize { #[cfg(test)] mod tests { - use super::compute_frequency; + use super::estimate_frequency; #[test] fn dead_beef() { assert_eq!( - compute_frequency(&[0xde, 0xad], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]), + estimate_frequency(&[0xde, 0xad], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]), 2 ); } #[test] fn smallest_body() { - assert_eq!(compute_frequency(&[0x00, 0xff], &[0x00, 0xff]), 1); + assert_eq!(estimate_frequency(&[0x00, 0xff], &[0x00, 0xff]), 1); } #[test] fn no_match() { assert_eq!( - compute_frequency(&[0xff, 0xff], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]), + estimate_frequency(&[0xff, 0xff], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]), 0 ); } diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs index 79273b95b..cabdca906 100644 --- a/src/dictionary/mod.rs +++ b/src/dictionary/mod.rs @@ -7,8 +7,6 @@ //! Relative Lempel-Ziv Dictionaries", by Kewen Liao, Matthias Petri, //! Alistair Moffat, and Anthony Wirth -const GIBIBYTE: usize = 1 << 30; - // The algorithm is summarized here // 1. The text is split into "epochs", or chunks from the original source // 2. From within each epoch, we select the "segment", or 1 KiB contiguous section @@ -31,11 +29,17 @@ mod reservoir; use core::cmp::Reverse; use cover::*; use std::{ + boxed::Box, collections::{BinaryHeap, HashMap}, + dbg, + fs::{self, File}, io::{self, BufReader, Read}, - vec, + path::{Path, PathBuf}, + vec::Vec, }; +use alloc::vec; + use crate::dictionary::reservoir::create_sample; /// A set of values that are used during dictionary construction. @@ -54,6 +58,46 @@ pub struct DictParams { pub segment_size: u32, } +/// Create a dictionary +pub fn create_dict_from_dir, W: io::Write>( + path: P, + output: &mut W, + dict_size: usize, +) -> Result<(), io::Error> { + // Collect a list of a path to every file in the directory into `file_paths` + let mut file_paths: Vec = Vec::new(); + let dir: fs::ReadDir = fs::read_dir(path)?; + fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec) -> Result<(), io::Error> { + for entry in dir { + let entry = entry?; + if entry.file_type()?.is_dir() { + recurse_read(fs::read_dir(&entry.path())?, file_paths)?; + } else { + file_paths.push(entry.path()); + } + } + Ok(()) + } + recurse_read(dir, &mut file_paths)?; + + // Open each file and chain the readers together + let mut total_file_len: u64 = 0; + let mut file_handles: Vec = Vec::new(); + for path in file_paths { + let handle = File::open(path)?; + total_file_len += handle.metadata()?.len(); + file_handles.push(handle); + } + let empty_reader: Box = Box::new(io::empty()); + let chained_files = file_handles + .iter() + .fold(empty_reader, |acc, reader| Box::new(acc.chain(reader))); + + // Create a dict using the new reader + create_dict_from_source(chained_files, total_file_len as usize, output, dict_size); + Ok(()) +} + /// Read from `source` to create a dictionary of `dict_size`. The completed dictionary is written /// to `output`. /// @@ -80,7 +124,7 @@ pub fn create_dict_from_source( // According to 4. Experiments - Varying Reservoir Sampler Thresholds, // setting reservoir size to collection size / min{collection size / (2 * number of segments), // 256} was effective - let sample_size = source_size / usize::min(source_size / (2 * num_segments), 256) / 1000; + let sample_size = source_size / usize::min(source_size / (2 * num_segments), 256); vprintln!("create_dict: creating {sample_size} byte sample of collection"); let collection_sample = create_sample(&mut buffered_source, sample_size); @@ -90,18 +134,20 @@ pub fn create_dict_from_source( // Reverse is used because we want a min heap, where // the lowest scoring items come first let mut pool: BinaryHeap> = BinaryHeap::new(); - let (num_epochs, epoch_size) = compute_epoch_info(¶ms, dict_size, source_size / K); + let (_, epoch_size) = compute_epoch_info(¶ms, dict_size, source_size / K); + let num_epochs = source_size / epoch_size; vprintln!("create_dict: computed epoch info, using {num_epochs} epochs of {epoch_size} bytes"); - let mut current_epoch = vec![0; epoch_size]; + //let mut current_epoch = vec![0; epoch_size]; + let mut current_epoch = vec![0; 100]; let mut epoch_counter = 0; let mut ctx = Context { frequencies: HashMap::with_capacity(epoch_size / K), }; // Score each segment in the epoch and select the highest scoring segment // for the pool - while buffered_source + while dbg!(buffered_source .read(&mut current_epoch) - .expect("can read input") + .expect("can read input")) != 0 { epoch_counter += 1; @@ -114,7 +160,10 @@ pub fn create_dict_from_source( // Wipe frequency list for next epoch ctx.frequencies.clear(); } - vprintln!("create_dict: writing {} segments", pool.len()); + vprintln!( + "create_dict: {epoch_counter} epochs written, writing {} segments", + pool.len() + ); // Write the dictionary with the highest scoring segment last because // closer items can be represented with a smaller offset while let Some(segment) = pool.pop() { From 34e2e909c621a2f92622ae357f92f027421b5102 Mon Sep 17 00:00:00 2001 From: arc Date: Sun, 10 Aug 2025 20:05:40 -0600 Subject: [PATCH 09/16] dict: rudimentary implementation complete --- src/bin/zstd.rs | 5 ++- src/bin/zstd_dict.rs | 93 +++++++++++++++++++++++++++++++------------- 2 files changed, 68 insertions(+), 30 deletions(-) diff --git a/src/bin/zstd.rs b/src/bin/zstd.rs index 3ca7344f1..21a1feba7 100644 --- a/src/bin/zstd.rs +++ b/src/bin/zstd.rs @@ -154,10 +154,11 @@ impl Read for PercentPrintReader { } fn main() { - let input = File::open("local_corpus_files/enwik9").expect("open input file"); + let input = File::open("ik9").expect("open input file"); + //let input = File::open("local_corpus_files/enwik9").expect("open input file"); let input_len = input.metadata().unwrap().len() as usize; let mut output = File::create("output.dict").expect("create output file"); - create_dict_from_source(input, input_len, &mut output, 1_000_000); + create_dict_from_source(input, input_len, &mut output, 5_000_000); //let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect(); //let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect(); //file_paths.remove(0); diff --git a/src/bin/zstd_dict.rs b/src/bin/zstd_dict.rs index 87041f76b..ab6096887 100644 --- a/src/bin/zstd_dict.rs +++ b/src/bin/zstd_dict.rs @@ -1,4 +1,5 @@ use ruzstd::dictionary::{create_dict_from_dir, create_dict_from_source}; +use std::cell::RefCell; use std::fmt::Display; use std::fs::File; use std::io::{self, Cursor, Read, Write}; @@ -23,7 +24,7 @@ fn main() { //} else { // create_dict_from_dir(input_path, &mut output, dict_size).unwrap(); //} - print!("{}", bench("local_corpus_files/github/")); + print!("{}", bench("local_corpus_files/sat-txt-files/")); } struct BenchmarkResults { @@ -38,30 +39,42 @@ impl Display for BenchmarkResults { writeln!(f, "uncompressed: 100.00% ({})", self.uncompressed_size)?; writeln!( f, - "no dict: {:.2} ({})", - f64::from(self.nodict_size as u32) / f64::from(self.uncompressed_size as u32), + "no dict: {:.2}% of original size ({})", + f64::from(self.nodict_size as u32) / f64::from(self.uncompressed_size as u32) * 100.0, self.nodict_size )?; writeln!( f, - "reference dict: {:.2} ({})", - f64::from(self.reference_size as u32) / f64::from(self.uncompressed_size as u32), - self.reference_size + "reference dict: {:.2}% of no dict size ({} bytes smaller)", + f64::from(self.reference_size as u32) / f64::from(self.nodict_size as u32) * 100.0, + self.nodict_size - self.reference_size )?; write!( f, - "our dict: {:.2} ({})", - f64::from(self.our_size as u32) / f64::from(self.uncompressed_size as u32), - self.our_size + "our dict: {:.2}% of no dict size ({} bytes smaller)", + f64::from(self.our_size as u32) / f64::from(self.nodict_size as u32) * 100.0, + self.nodict_size - self.our_size )?; Ok(()) } } +struct Dumpster(pub usize); + +impl Write for Dumpster { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.0 += buf.len(); + Ok(buf.len()) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + fn bench>(input_path: P) -> BenchmarkResults { // At what compression level the dicts are built with - let compression_level = 22; - + let compression_level = 1; // 1. Collect a list of a path to every file in the directory into `file_paths` println!("[bench]: collecting list of input files"); let mut file_paths: Vec = Vec::new(); @@ -89,45 +102,69 @@ fn bench>(input_path: P) -> BenchmarkResults { // Open each file and compress it let mut uncompressed_size: usize = 0; let mut nodict_size: usize = 0; - let mut reference_size: usize = 0; - let mut our_size: usize = 0; - let mut reference_output: Vec = Vec::with_capacity(128_000); + let mut reference_output = Dumpster(0); let mut reference_encoder = zstd::Encoder::with_dictionary(&mut reference_output, compression_level, &reference_dict) .unwrap(); reference_encoder.multithread(8).unwrap(); - let mut our_output: Vec = Vec::with_capacity(128_000); + let mut our_output = Dumpster(0); let mut our_encoder = zstd::Encoder::with_dictionary(&mut our_output, compression_level, &our_dict).unwrap(); our_encoder.multithread(8).unwrap(); for (idx, path) in file_paths.iter().enumerate() { - println!("[bench]: compressing file {}/{}", idx + 1, file_paths.len()); + if idx % 10 == 0 { + println!("[bench]: compressing file {}/{}", idx + 1, file_paths.len()); + } let mut handle = File::open(path).unwrap(); let mut data = Vec::new(); - handle.read_to_end(&mut data); + handle.read_to_end(&mut data).unwrap(); uncompressed_size += data.len(); // Compress with no dict let nodict_output = zstd::encode_all(data.as_slice(), compression_level).unwrap(); nodict_size += nodict_output.len(); // Compress with the reference dict - reference_encoder.write_all(data.as_slice()); reference_encoder - .do_finish() - .expect("reference encoder finishes"); - reference_size += reference_output.len(); - reference_output.clear(); + .write_all(data.as_slice()) + .expect("reference writer writing"); // Compress with our dict - our_encoder.write_all(data.as_slice()); - our_encoder.finish().expect("our encoder finishes"); - our_size += our_output.len(); - our_output.clear(); + our_encoder + .write_all(data.as_slice()) + .expect("our writer writing"); } + //println!("[bench]: reading all files"); + //let mut all_files: Vec = Vec::with_capacity(1_000_000); + //for path in file_paths { + // let mut handle = File::open(path).unwrap(); + // handle + // .read_to_end(&mut all_files) + // .expect("reading input file"); + //} + //uncompressed_size = all_files.len(); + //// // Compress with no dict + //println!("[bench]: compressing using no dict"); + //let nodict_output = zstd::encode_all(all_files.as_slice(), compression_level).unwrap(); + //nodict_size = nodict_output.len(); + //drop(nodict_output); + //println!("[bench]: compressing using reference encoder"); + //reference_encoder + // .write_all(&all_files) + // .expect("writing to reference encoder"); + //println!("[bench]: compressing using our encoder"); + //our_encoder + // .write_all(&all_files) + // .expect("writing to our encoder"); + //our_encoder.do_finish().expect("our encoder finishes"); + //reference_encoder + // .do_finish() + // .expect("reference encoder finishes"); + //drop(reference_encoder); + //drop(our_encoder); BenchmarkResults { uncompressed_size, nodict_size, - reference_size, - our_size, + reference_size: reference_output.0, + our_size: our_output.0, } } From bfea46ac44a05d7e4f5d7291e2523f95e9eae4fe Mon Sep 17 00:00:00 2001 From: arc Date: Fri, 15 Aug 2025 10:35:27 -0600 Subject: [PATCH 10/16] dict: pre-clippy auto apply --- Cargo.toml | 5 +- benches/decode_all.rs | 2 +- src/bin/zstd.rs | 16 +- src/bin/zstd_dict.rs | 294 ++++++++++++++++-------------------- src/bin/zstd_stream.rs | 5 +- src/dictionary/cover.rs | 17 +-- src/dictionary/frequency.rs | 8 +- src/dictionary/mod.rs | 36 +++-- 8 files changed, 173 insertions(+), 210 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ebd6dbdf7..d66f4490f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,12 +21,12 @@ compiler_builtins = { version = "0.1.2", optional = true } core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" } alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" } fastrand = "2.3.0" -zstd = { version = "0.13.2", features = ["zstdmt"]} [dev-dependencies] criterion = "0.5" rand = { version = "0.8.5", features = ["small_rng"] } +zstd = { version = "0.13.2", features = ["zstdmt"]} [features] default = ["hash", "std"] @@ -53,6 +53,3 @@ required-features = ["std"] [[bin]] name = "zstd_dict" required-features = ["std"] -dependancies = [ - "zstd" -] diff --git a/benches/decode_all.rs b/benches/decode_all.rs index 463113958..439986785 100644 --- a/benches/decode_all.rs +++ b/benches/decode_all.rs @@ -3,7 +3,7 @@ use ruzstd::decoding::FrameDecoder; fn criterion_benchmark(c: &mut Criterion) { let mut fr = FrameDecoder::new(); - let mut target_slice = &mut vec![0u8; 1024 * 1024 * 200]; + let target_slice = &mut vec![0u8; 1024 * 1024 * 200]; let src = include_bytes!("../decodecorpus_files/z000033.zst"); c.bench_function("decode_all_slice", |b| { diff --git a/src/bin/zstd.rs b/src/bin/zstd.rs index 21a1feba7..b55318036 100644 --- a/src/bin/zstd.rs +++ b/src/bin/zstd.rs @@ -1,17 +1,13 @@ extern crate ruzstd; use std::fs::File; -use std::io::BufReader; use std::io::Read; use std::io::Seek; use std::io::SeekFrom; use std::io::Write; -use std::time::Instant; use ruzstd::decoding::errors::FrameDecoderError; use ruzstd::decoding::errors::ReadFrameHeaderError; use ruzstd::dictionary::create_dict_from_source; -use ruzstd::encoding::CompressionLevel; -use ruzstd::encoding::FrameCompressor; struct StateTracker { bytes_used: u64, @@ -22,7 +18,7 @@ struct StateTracker { file_size: u64, old_percentage: i8, } - +#[allow(unused)] fn decompress(flags: &[String], file_paths: &[String]) { if !flags.contains(&"-d".to_owned()) { eprintln!("This zstd implementation only supports decompression. Please add a \"-d\" flag"); @@ -36,8 +32,7 @@ fn decompress(flags: &[String], file_paths: &[String]) { if flags.len() != 2 { eprintln!( - "No flags other than -d and -c are currently implemented. Flags used: {:?}", - flags + "No flags other than -d and -c are currently implemented. Flags used: {flags:?}" ); return; } @@ -45,7 +40,7 @@ fn decompress(flags: &[String], file_paths: &[String]) { let mut frame_dec = ruzstd::decoding::FrameDecoder::new(); for path in file_paths { - eprintln!("File: {}", path); + eprintln!("File: {path}"); let mut f = File::open(path).unwrap(); let mut tracker = StateTracker { @@ -132,6 +127,7 @@ fn decompress(flags: &[String], file_paths: &[String]) { } } +#[allow(unused)] struct PercentPrintReader { total: usize, counter: usize, @@ -147,7 +143,7 @@ impl Read for PercentPrintReader { if progress > self.last_percent { self.last_percent = progress; eprint!("\r"); - eprint!("{} % done", progress); + eprint!("{progress} % done"); } Ok(new_bytes) } @@ -207,7 +203,7 @@ fn do_something(data: &[u8], s: &mut StateTracker) { let percentage = (s.file_pos * 100) / s.file_size; if percentage as i8 != s.old_percentage { eprint!("\r"); - eprint!("{} % done", percentage); + eprint!("{percentage} % done"); s.old_percentage = percentage as i8; } } diff --git a/src/bin/zstd_dict.rs b/src/bin/zstd_dict.rs index ab6096887..f502700e7 100644 --- a/src/bin/zstd_dict.rs +++ b/src/bin/zstd_dict.rs @@ -1,170 +1,140 @@ use ruzstd::dictionary::{create_dict_from_dir, create_dict_from_source}; -use std::cell::RefCell; -use std::fmt::Display; use std::fs::File; -use std::io::{self, Cursor, Read, Write}; -use std::path::{Path, PathBuf}; -use std::{env::args, fs}; +use std::path::Path; +use std::env::args; fn main() { - //let args: Vec = args().collect(); - //let input_path: &Path = args.get(1).expect("no input provided").as_ref(); - //let output_path: &Path = args.get(2).expect("no output path provided").as_ref(); - //let dict_size = args - // .get(3) - // .expect("no dict size provided (kb)") - // .parse::() - // .expect("dict size was not a valid num"); - // - //let mut output = File::create(output_path).unwrap(); - //if input_path.is_file() { - // let source = File::open(input_path).expect("unable to open input path"); - // let source_size = source.metadata().unwrap().len(); - // create_dict_from_source(source, source_size as usize, &mut output, dict_size); - //} else { - // create_dict_from_dir(input_path, &mut output, dict_size).unwrap(); - //} - print!("{}", bench("local_corpus_files/sat-txt-files/")); -} - -struct BenchmarkResults { - uncompressed_size: usize, - nodict_size: usize, - reference_size: usize, - our_size: usize, -} - -impl Display for BenchmarkResults { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "uncompressed: 100.00% ({})", self.uncompressed_size)?; - writeln!( - f, - "no dict: {:.2}% of original size ({})", - f64::from(self.nodict_size as u32) / f64::from(self.uncompressed_size as u32) * 100.0, - self.nodict_size - )?; - writeln!( - f, - "reference dict: {:.2}% of no dict size ({} bytes smaller)", - f64::from(self.reference_size as u32) / f64::from(self.nodict_size as u32) * 100.0, - self.nodict_size - self.reference_size - )?; - write!( - f, - "our dict: {:.2}% of no dict size ({} bytes smaller)", - f64::from(self.our_size as u32) / f64::from(self.nodict_size as u32) * 100.0, - self.nodict_size - self.our_size - )?; - Ok(()) - } -} - -struct Dumpster(pub usize); - -impl Write for Dumpster { - fn write(&mut self, buf: &[u8]) -> io::Result { - self.0 += buf.len(); - Ok(buf.len()) - } + let args: Vec = args().collect(); + let input_path: &Path = args.get(1).expect("no input provided").as_ref(); + let output_path: &Path = args.get(2).expect("no output path provided").as_ref(); + let dict_size = args + .get(3) + .expect("no dict size provided (kb)") + .parse::() + .expect("dict size was not a valid num"); - fn flush(&mut self) -> io::Result<()> { - Ok(()) + let mut output = File::create(output_path).unwrap(); + if input_path.is_file() { + let source = File::open(input_path).expect("unable to open input path"); + let source_size = source.metadata().unwrap().len(); + create_dict_from_source(source, source_size as usize, &mut output, dict_size); + } else { + create_dict_from_dir(input_path, &mut output, dict_size).unwrap(); } } -fn bench>(input_path: P) -> BenchmarkResults { - // At what compression level the dicts are built with - let compression_level = 1; - // 1. Collect a list of a path to every file in the directory into `file_paths` - println!("[bench]: collecting list of input files"); - let mut file_paths: Vec = Vec::new(); - let dir: fs::ReadDir = fs::read_dir(&input_path).expect("read input path"); - fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec) -> Result<(), io::Error> { - for entry in dir { - let entry = entry?; - if entry.file_type()?.is_dir() { - recurse_read(fs::read_dir(&entry.path())?, file_paths)?; - } else { - file_paths.push(entry.path()); - } - } - Ok(()) - } - recurse_read(dir, &mut file_paths).expect("recursing over input dir"); - - // 2. Create two dictionaries, one with our strategy, and one with theirs - println!("[bench]: creating reference dict"); - let reference_dict = - zstd::dict::from_files(file_paths.iter(), 112640).expect("create reference dict"); - let mut our_dict = Vec::with_capacity(112640); - println!("[bench]: creating our dict"); - create_dict_from_dir(input_path, &mut our_dict, 112640).expect("create our dict"); - // Open each file and compress it - let mut uncompressed_size: usize = 0; - let mut nodict_size: usize = 0; - - let mut reference_output = Dumpster(0); - let mut reference_encoder = - zstd::Encoder::with_dictionary(&mut reference_output, compression_level, &reference_dict) - .unwrap(); - reference_encoder.multithread(8).unwrap(); - let mut our_output = Dumpster(0); - let mut our_encoder = - zstd::Encoder::with_dictionary(&mut our_output, compression_level, &our_dict).unwrap(); - our_encoder.multithread(8).unwrap(); - for (idx, path) in file_paths.iter().enumerate() { - if idx % 10 == 0 { - println!("[bench]: compressing file {}/{}", idx + 1, file_paths.len()); - } - let mut handle = File::open(path).unwrap(); - let mut data = Vec::new(); - handle.read_to_end(&mut data).unwrap(); - uncompressed_size += data.len(); - // Compress with no dict - let nodict_output = zstd::encode_all(data.as_slice(), compression_level).unwrap(); - nodict_size += nodict_output.len(); - // Compress with the reference dict - reference_encoder - .write_all(data.as_slice()) - .expect("reference writer writing"); - // Compress with our dict - our_encoder - .write_all(data.as_slice()) - .expect("our writer writing"); - } - //println!("[bench]: reading all files"); - //let mut all_files: Vec = Vec::with_capacity(1_000_000); - //for path in file_paths { - // let mut handle = File::open(path).unwrap(); - // handle - // .read_to_end(&mut all_files) - // .expect("reading input file"); - //} - //uncompressed_size = all_files.len(); - //// // Compress with no dict - //println!("[bench]: compressing using no dict"); - //let nodict_output = zstd::encode_all(all_files.as_slice(), compression_level).unwrap(); - //nodict_size = nodict_output.len(); - //drop(nodict_output); - //println!("[bench]: compressing using reference encoder"); - //reference_encoder - // .write_all(&all_files) - // .expect("writing to reference encoder"); - //println!("[bench]: compressing using our encoder"); - //our_encoder - // .write_all(&all_files) - // .expect("writing to our encoder"); - //our_encoder.do_finish().expect("our encoder finishes"); - //reference_encoder - // .do_finish() - // .expect("reference encoder finishes"); - //drop(reference_encoder); - //drop(our_encoder); - - BenchmarkResults { - uncompressed_size, - nodict_size, - reference_size: reference_output.0, - our_size: our_output.0, - } -} +//struct BenchmarkResults { +// pub uncompressed_size: usize, +// pub nodict_size: usize, +// pub reference_size: usize, +// pub our_size: usize, +//} +// +//impl Display for BenchmarkResults { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// writeln!(f, "uncompressed: 100.00% ({})", self.uncompressed_size)?; +// writeln!( +// f, +// "no dict: {:.2}% of original size ({})", +// f64::from(self.nodict_size as u32) / f64::from(self.uncompressed_size as u32) * 100.0, +// self.nodict_size +// )?; +// writeln!( +// f, +// "reference dict: {:.2}% of no dict size ({} bytes smaller)", +// f64::from(self.reference_size as u32) / f64::from(self.nodict_size as u32) * 100.0, +// self.nodict_size - self.reference_size +// )?; +// write!( +// f, +// "our dict: {:.2}% of no dict size ({} bytes smaller)", +// f64::from(self.our_size as u32) / f64::from(self.nodict_size as u32) * 100.0, +// self.nodict_size - self.our_size +// )?; +// Ok(()) +// } +//} +// +//struct Dumpster(pub usize); +// +//impl Write for Dumpster { +// fn write(&mut self, buf: &[u8]) -> io::Result { +// self.0 += buf.len(); +// Ok(buf.len()) +// } +// +// fn flush(&mut self) -> io::Result<()> { +// Ok(()) +// } +//} +// +///// Compares compression ratios achieved with a dictionary +//#[allow(unused)] +//fn bench>(input_path: P) -> BenchmarkResults { +// // At what compression level the dicts are built with +// let compression_level = 1; +// // 1. Collect a list of a path to every file in the directory into `file_paths` +// println!("[bench]: collecting list of input files"); +// let mut file_paths: Vec = Vec::new(); +// let dir: fs::ReadDir = fs::read_dir(&input_path).expect("read input path"); +// fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec) -> Result<(), io::Error> { +// for entry in dir { +// let entry = entry?; +// if entry.file_type()?.is_dir() { +// recurse_read(fs::read_dir(&entry.path())?, file_paths)?; +// } else { +// file_paths.push(entry.path()); +// } +// } +// Ok(()) +// } +// recurse_read(dir, &mut file_paths).expect("recursing over input dir"); +// +// // 2. Create two dictionaries, one with our strategy, and one with theirs +// println!("[bench]: creating reference dict"); +// let reference_dict = +// zstd::dict::from_files(file_paths.iter(), 112640).expect("create reference dict"); +// let mut our_dict = Vec::with_capacity(112640); +// println!("[bench]: creating our dict"); +// create_dict_from_dir(input_path, &mut our_dict, 112640).expect("create our dict"); +// // Open each file and compress it +// let mut uncompressed_size: usize = 0; +// let mut nodict_size: usize = 0; +// +// let mut reference_output = Dumpster(0); +// let mut reference_encoder = +// zstd::Encoder::with_dictionary(&mut reference_output, compression_level, &reference_dict) +// .unwrap(); +// reference_encoder.multithread(8).unwrap(); +// let mut our_output = Dumpster(0); +// let mut our_encoder = +// zstd::Encoder::with_dictionary(&mut our_output, compression_level, &our_dict).unwrap(); +// our_encoder.multithread(8).unwrap(); +// for (idx, path) in file_paths.iter().enumerate() { +// if idx % 10 == 0 { +// println!("[bench]: compressing file {}/{}", idx + 1, file_paths.len()); +// } +// let mut handle = File::open(path).unwrap(); +// let mut data = Vec::new(); +// handle.read_to_end(&mut data).unwrap(); +// uncompressed_size += data.len(); +// // Compress with no dict +// let nodict_output = zstd::encode_all(data.as_slice(), compression_level).unwrap(); +// nodict_size += nodict_output.len(); +// // Compress with the reference dict +// reference_encoder +// .write_all(data.as_slice()) +// .expect("reference writer writing"); +// // Compress with our dict +// our_encoder +// .write_all(data.as_slice()) +// .expect("our writer writing"); +// } +// +// BenchmarkResults { +// uncompressed_size, +// nodict_size, +// reference_size: reference_output.0, +// our_size: our_output.0, +// } +//} diff --git a/src/bin/zstd_stream.rs b/src/bin/zstd_stream.rs index 609530e5b..521abf464 100644 --- a/src/bin/zstd_stream.rs +++ b/src/bin/zstd_stream.rs @@ -19,14 +19,13 @@ fn main() { if flags.len() != 2 { eprintln!( - "No flags other than -d and -c are currently implemented. Flags used: {:?}", - flags + "No flags other than -d and -c are currently implemented. Flags used: {flags:?}" ); return; } for path in file_paths { - eprintln!("File: {}", path); + eprintln!("File: {path}"); let f = File::open(path).unwrap(); let mut buf_read = std::io::BufReader::new(f); diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs index 407cc9471..9f2c94922 100644 --- a/src/dictionary/cover.rs +++ b/src/dictionary/cover.rs @@ -1,4 +1,4 @@ -//! An implementation of the dictionary generation algorithm +//! An implementation of the local maximum coverage algorithm //! described in the paper "Effective Construction of Relative Lempel-Ziv Dictionaries", //! by Liao, Petri, Moffat, and Wirth, published under the University of Melbourne. //! @@ -9,14 +9,13 @@ use super::DictParams; use crate::dictionary::frequency::estimate_frequency; -use crate::dictionary::reservoir::create_sample; use core::convert::TryInto; -use std::collections::{BinaryHeap, HashMap}; -use std::io::{Cursor, Read}; +use std::collections::HashMap; use std::vec::Vec; /// The size of each k-mer pub(super) const K: usize = 16; + ///As found under "4: Experiments - Varying k-mer Size" in the original paper, /// "when k = 16, across all our text collections, there is a reasonable spread" /// @@ -67,14 +66,6 @@ pub struct Context { pub frequencies: HashMap, } -impl Context { - fn new() -> Self { - Self { - frequencies: HashMap::new(), - } - } -} - /// Returns the highest scoring segment in an epoch /// as a slice of that epoch. pub fn pick_best_segment<'epoch>( @@ -104,7 +95,7 @@ pub fn pick_best_segment<'epoch>( /// Given a segment, compute the score (or usefulness) of that segment against the entire epoch. /// -/// `score_segment` modifies ctx.frequencies. +/// `score_segment` modifies `ctx.frequencies`. fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> usize { let mut segment_score = 0; // Determine the score of each overlapping k-mer diff --git a/src/dictionary/frequency.rs b/src/dictionary/frequency.rs index 72aa03531..074e73839 100644 --- a/src/dictionary/frequency.rs +++ b/src/dictionary/frequency.rs @@ -5,13 +5,7 @@ /// Computes a best effort guess as to how many times `pattern` occurs within /// `body`. While not 100% accurate, it will be accurate the vast majority of time -pub(super) fn estimate_frequency(pattern: &[u8], body: &[u8]) -> usize { - //vprintln!( - // "\tkarp-rabin: searching haystack of size {} for needle of size {} with ident {}", - // pattern.len(), - // body.len(), - // pattern[0] + pattern[1] - //); +pub fn estimate_frequency(pattern: &[u8], body: &[u8]) -> usize { assert!(body.len() >= pattern.len()); // A prime number for modulo operations to reduce collisions (q) const PRIME: isize = 2654435761; diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs index cabdca906..bb607b4f0 100644 --- a/src/dictionary/mod.rs +++ b/src/dictionary/mod.rs @@ -19,13 +19,15 @@ // reservoir of n/t, where `t` is the desired number of occurances // we want the most common k-mers to have // - Have the ability to estimate -// the frequency of a given k-mer: f(w: k-mer) calculates +// the frequency of a given k-mer: `f(w: k-mer)` calculates // the frequency of w in the reservoir using a rolling karp-rabin hash // - The score of a segment is the sum of `f(w)` called on every kmer within the segment mod cover; mod frequency; mod reservoir; +use crate::dictionary::reservoir::create_sample; +use alloc::vec; use core::cmp::Reverse; use cover::*; use std::{ @@ -38,13 +40,10 @@ use std::{ vec::Vec, }; -use alloc::vec; - -use crate::dictionary::reservoir::create_sample; - /// A set of values that are used during dictionary construction. /// /// Changing these values can improve the resulting dictionary size for certain datasets. +// TODO: move `k` here. pub struct DictParams { /// Segment size. /// @@ -58,7 +57,22 @@ pub struct DictParams { pub segment_size: u32, } -/// Create a dictionary +/// Creates a dictionary, training off of every file in this directory and all +/// sub-directories. +/// +/// The resulting dictionary will be approxamitely `dict_size` or less, and written to `output`. +/// +/// # Errors +/// This function returns `Ok(())` if the dictionary was created successfully, and an +/// `Err(io::Error)` if an error was encountered reading the input directory. +/// +/// # Examples +/// ```no_run +/// // Create a roughly 1mb dictionary, training off of file in `sample_files` +/// let input_folder = "sample_files/"; +/// let output = File::create("output.dict"); +/// ruzstd::dict::create_dict_from_dir(input_folder, &mut output, 1_000_000); +/// ``` pub fn create_dict_from_dir, W: io::Write>( path: P, output: &mut W, @@ -103,8 +117,8 @@ pub fn create_dict_from_dir, W: io::Write>( /// /// - `source` will be used as training data for the entire dictionary. /// - `source_size` influences how the data is divided and sampled and is measured -/// in bytes. While this does not need to be exact, estimates should attempt to be -/// larger than the actual collection size. +/// in bytes. While this does not need to be exact, estimates should attempt to be +/// larger than the actual collection size. /// - `output` is where the completed dictionary will be written. /// - `dict_size` determines how large the complete dictionary should be. The completed /// dictionary will be this size or smaller. @@ -117,7 +131,7 @@ pub fn create_dict_from_source( dict_size: usize, ) { vprintln!("create_dict: creating {dict_size} byte dict from {source_size} byte source"); - let mut buffered_source = BufReader::with_capacity(5_000_000, source); + let mut buffered_source = BufReader::with_capacity(128_000, source); let params = DictParams { segment_size: 2048 }; let num_segments = source_size / params.segment_size as usize; @@ -167,6 +181,8 @@ pub fn create_dict_from_source( // Write the dictionary with the highest scoring segment last because // closer items can be represented with a smaller offset while let Some(segment) = pool.pop() { - output.write(&segment.0.raw).expect("can write to output"); + output + .write_all(&segment.0.raw) + .expect("can write to output"); } } From e17156d100cedde646ca0a7bf128d2a2a20c1576 Mon Sep 17 00:00:00 2001 From: arc Date: Tue, 19 Aug 2025 11:00:12 -0600 Subject: [PATCH 11/16] refactor: specify raw content dictionary creation --- src/dictionary/mod.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs index bb607b4f0..28b72932d 100644 --- a/src/dictionary/mod.rs +++ b/src/dictionary/mod.rs @@ -57,7 +57,7 @@ pub struct DictParams { pub segment_size: u32, } -/// Creates a dictionary, training off of every file in this directory and all +/// Creates a "raw content" dictionary, training off of every file in this directory and all /// sub-directories. /// /// The resulting dictionary will be approxamitely `dict_size` or less, and written to `output`. @@ -73,7 +73,7 @@ pub struct DictParams { /// let output = File::create("output.dict"); /// ruzstd::dict::create_dict_from_dir(input_folder, &mut output, 1_000_000); /// ``` -pub fn create_dict_from_dir, W: io::Write>( +pub fn create_raw_dict_from_dir, W: io::Write>( path: P, output: &mut W, dict_size: usize, @@ -108,12 +108,12 @@ pub fn create_dict_from_dir, W: io::Write>( .fold(empty_reader, |acc, reader| Box::new(acc.chain(reader))); // Create a dict using the new reader - create_dict_from_source(chained_files, total_file_len as usize, output, dict_size); + create_raw_dict_from_source(chained_files, total_file_len as usize, output, dict_size); Ok(()) } -/// Read from `source` to create a dictionary of `dict_size`. The completed dictionary is written -/// to `output`. +/// Read from `source` to create a "raw content" dictionary of `dict_size`. +/// The completed dictionary is written to `output`. /// /// - `source` will be used as training data for the entire dictionary. /// - `source_size` influences how the data is divided and sampled and is measured @@ -124,7 +124,7 @@ pub fn create_dict_from_dir, W: io::Write>( /// dictionary will be this size or smaller. /// /// This function uses `BufRead` internally, the provided reader need not be buffered. -pub fn create_dict_from_source( +pub fn create_raw_dict_from_source( source: R, source_size: usize, output: &mut W, From 5213ef79b4b7165e7454703368aabc4ef1969bc6 Mon Sep 17 00:00:00 2001 From: arc Date: Tue, 19 Aug 2025 11:11:02 -0600 Subject: [PATCH 12/16] lint: fixing clippy --- src/bin/zstd.rs | 86 +++++++++++++++++----------------- src/bin/zstd_dict.rs | 6 +-- src/bit_io/bit_reader.rs | 6 +-- src/encoding/levels/default.rs | 27 ----------- src/encoding/levels/mod.rs | 2 - 5 files changed, 47 insertions(+), 80 deletions(-) delete mode 100644 src/encoding/levels/default.rs diff --git a/src/bin/zstd.rs b/src/bin/zstd.rs index b55318036..cc9762afb 100644 --- a/src/bin/zstd.rs +++ b/src/bin/zstd.rs @@ -1,13 +1,16 @@ extern crate ruzstd; use std::fs::File; +use std::io::BufReader; use std::io::Read; use std::io::Seek; use std::io::SeekFrom; use std::io::Write; +use std::time::Instant; use ruzstd::decoding::errors::FrameDecoderError; use ruzstd::decoding::errors::ReadFrameHeaderError; -use ruzstd::dictionary::create_dict_from_source; +use ruzstd::encoding::CompressionLevel; +use ruzstd::encoding::FrameCompressor; struct StateTracker { bytes_used: u64, @@ -150,49 +153,44 @@ impl Read for PercentPrintReader { } fn main() { - let input = File::open("ik9").expect("open input file"); - //let input = File::open("local_corpus_files/enwik9").expect("open input file"); - let input_len = input.metadata().unwrap().len() as usize; - let mut output = File::create("output.dict").expect("create output file"); - create_dict_from_source(input, input_len, &mut output, 5_000_000); - //let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect(); - //let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect(); - //file_paths.remove(0); - // - //if flags.is_empty() { - // let mut encoder = FrameCompressor::new(CompressionLevel::Fastest); - // encoder.set_drain(Vec::new()); - // - // for path in file_paths { - // let start_instant = Instant::now(); - // let file = std::fs::File::open(&path).unwrap(); - // let input_len = file.metadata().unwrap().len() as usize; - // let file = PercentPrintReader { - // reader: BufReader::new(file), - // total: input_len, - // counter: 0, - // last_percent: 0, - // }; - // encoder.set_source(file); - // encoder.compress(); - // let mut output: Vec<_> = encoder.take_drain().unwrap(); - // println!( - // "Compressed {path:} from {} to {} ({}%) took {}ms", - // input_len, - // output.len(), - // if input_len == 0 { - // 0 - // } else { - // output.len() * 100 / input_len - // }, - // start_instant.elapsed().as_millis() - // ); - // output.clear(); - // encoder.set_drain(output); - // } - //} else { - // decompress(&flags, &file_paths); - //} + let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect(); + let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect(); + file_paths.remove(0); + + if flags.is_empty() { + let mut encoder = FrameCompressor::new(CompressionLevel::Fastest); + encoder.set_drain(Vec::new()); + + for path in file_paths { + let start_instant = Instant::now(); + let file = std::fs::File::open(&path).unwrap(); + let input_len = file.metadata().unwrap().len() as usize; + let file = PercentPrintReader { + reader: BufReader::new(file), + total: input_len, + counter: 0, + last_percent: 0, + }; + encoder.set_source(file); + encoder.compress(); + let mut output: Vec<_> = encoder.take_drain().unwrap(); + println!( + "Compressed {path:} from {} to {} ({}%) took {}ms", + input_len, + output.len(), + if input_len == 0 { + 0 + } else { + output.len() * 100 / input_len + }, + start_instant.elapsed().as_millis() + ); + output.clear(); + encoder.set_drain(output); + } + } else { + decompress(&flags, &file_paths); + } } fn do_something(data: &[u8], s: &mut StateTracker) { diff --git a/src/bin/zstd_dict.rs b/src/bin/zstd_dict.rs index f502700e7..25f24269b 100644 --- a/src/bin/zstd_dict.rs +++ b/src/bin/zstd_dict.rs @@ -1,4 +1,4 @@ -use ruzstd::dictionary::{create_dict_from_dir, create_dict_from_source}; +use ruzstd::dictionary::{create_raw_dict_from_dir, create_raw_dict_from_source}; use std::fs::File; use std::path::Path; use std::env::args; @@ -17,9 +17,9 @@ fn main() { if input_path.is_file() { let source = File::open(input_path).expect("unable to open input path"); let source_size = source.metadata().unwrap().len(); - create_dict_from_source(source, source_size as usize, &mut output, dict_size); + create_raw_dict_from_source(source, source_size as usize, &mut output, dict_size); } else { - create_dict_from_dir(input_path, &mut output, dict_size).unwrap(); + create_raw_dict_from_dir(input_path, &mut output, dict_size).unwrap(); } } diff --git a/src/bit_io/bit_reader.rs b/src/bit_io/bit_reader.rs index 4e88948a0..c8987250e 100644 --- a/src/bit_io/bit_reader.rs +++ b/src/bit_io/bit_reader.rs @@ -116,8 +116,7 @@ impl core::fmt::Display for GetBitsError { } => { write!( f, - "Cant serve this request. The reader is limited to {} bits, requested {} bits", - limit, num_requested_bits, + "Cant serve this request. The reader is limited to {limit} bits, requested {num_requested_bits} bits" ) } GetBitsError::NotEnoughRemainingBits { @@ -126,8 +125,7 @@ impl core::fmt::Display for GetBitsError { } => { write!( f, - "Can\'t read {} bits, only have {} bits left", - requested, remaining, + "Can\'t read {requested} bits, only have {remaining} bits left" ) } } diff --git a/src/encoding/levels/default.rs b/src/encoding/levels/default.rs deleted file mode 100644 index 4b83bd246..000000000 --- a/src/encoding/levels/default.rs +++ /dev/null @@ -1,27 +0,0 @@ -use crate::{ - common::MAX_BLOCK_SIZE, - encoding::{ - block_header::BlockHeader, blocks::compress_block, frame_compressor::CompressState, Matcher, - }, -}; -use alloc::vec::Vec; - -/// Compresses a single block at [`crate::encoding::CompressionLevel::Default`]. -/// -/// # Parameters -/// - `state`: [`CompressState`] so the compressor can refer to data prior to -/// the start of this block -/// - `last_block`: Whether or not this block is going to be the last block in the frame -/// (needed because this info is written into the block header) -/// - `uncompressed_data`: A block's worth of uncompressed data, taken from the -/// larger input -/// - `output`: As `uncompressed_data` is compressed, it's appended to `output`. -#[inline] -pub fn compress_default( - state: &mut CompressState, - last_block: bool, - uncompressed_data: Vec, - output: &mut Vec, -) { - let block_size = uncompressed_data.len() as u32; -} diff --git a/src/encoding/levels/mod.rs b/src/encoding/levels/mod.rs index ce6f66bd8..fb39caaf8 100644 --- a/src/encoding/levels/mod.rs +++ b/src/encoding/levels/mod.rs @@ -1,4 +1,2 @@ mod fastest; pub use fastest::compress_fastest; -mod default; -pub use default::compress_default; From a710b220422bc65dcf4f1a7f92fed1adbef3767b Mon Sep 17 00:00:00 2001 From: arc Date: Tue, 19 Aug 2025 11:51:53 -0600 Subject: [PATCH 13/16] docs: update readme.md to include dict builder --- .gitignore | 1 + Cargo.toml | 5 +-- Readme.md | 37 +++++++++++++++++--- src/bin/zstd.rs | 68 ++++++++++++++++++------------------- src/bin/zstd_dict.rs | 2 +- src/bin/zstd_stream.rs | 4 +-- src/dictionary/cover.rs | 18 ++++------ src/dictionary/mod.rs | 6 ++-- src/dictionary/reservoir.rs | 5 ++- src/lib.rs | 1 + src/tests/decode_corpus.rs | 3 +- src/tests/dict_test.rs | 5 +-- src/tests/mod.rs | 13 ++++--- 13 files changed, 97 insertions(+), 71 deletions(-) diff --git a/.gitignore b/.gitignore index 118714d18..6cb4d5724 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ **/*.rs.bk Cargo.lock /local_corpus_files +/local_dict_corpus_files /orig-zstd fuzz_decodecorpus perf.data* diff --git a/Cargo.toml b/Cargo.toml index d66f4490f..996e2478a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ categories = ["compression"] [dependencies] twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"], optional = true } -# Internal feature, only used when building as part of libstd, not part of the +# Internal feature, only used when building as part of libstd, not part of theea # stable interface of this crate. compiler_builtins = { version = "0.1.2", optional = true } core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" } @@ -33,6 +33,7 @@ default = ["hash", "std"] hash = ["dep:twox-hash"] fuzz_exports = [] std = [] +dict_builder = ["std"] # Internal feature, only used when building as part of libstd, not part of the # stable interface of this crate. @@ -52,4 +53,4 @@ required-features = ["std"] [[bin]] name = "zstd_dict" -required-features = ["std"] +required-features = ["std", "dict_builder"] diff --git a/Readme.md b/Readme.md index 3281a3d02..c96e787b4 100644 --- a/Readme.md +++ b/Readme.md @@ -15,8 +15,20 @@ This crate is currently actively maintained. # Current Status -Feature complete on the decoder side. +## Decompression +The `decoding` module provides a complete +implementation of a Zstandard decompressor. + +In terms of speed, `ruzstd` is behind the original C implementation +which has a rust binding located [here](https://github.com/gyscos/zstd-rs). +Measuring with the 'time' utility the original zstd and my decoder both +decoding the same enwik9.zst file from a ramfs, my decoder is about 3.5 +times slower. Enwik9 is highly compressible, for less compressible data +(like a ubuntu installation .iso) my decoder comes close to only being +1.4 times slower. + +## Compression On the compression side: - Support for generating compressed blocks at any compression level - [x] Uncompressed @@ -24,13 +36,28 @@ On the compression side: - [ ] Default (roughly level 3) - [ ] Better (roughly level 7) - [ ] Best (roughly level 11) -- [ ] Checksums +- [x] Checksums - [ ] Dictionaries -## Speed -In terms of speed this library is behind the original C implementation which has a rust binding located [here](https://github.com/gyscos/zstd-rs). +## Dictionary Generation +When the `dict_builder` feature is enabled, the `dictionary` module +provides the ability to create new dictionaries. + +On the `github-users` sample set, our implementation benchmarks within +0.2% of the official implementation (as of commit +`09e52d07340acdb2e13817b066e8be6e424f7258`): +``` +uncompressed: 100.00% (7484607 bytes) +no dict: 34.99% of original size (2618872 bytes) +reference dict: 16.16% of no dict size (2195672 bytes smaller) +our dict: 16.28% of no dict size (2192400 bytes smaller) +``` + +The dictionary generator only provides support for creating "raw +content" dictionaries. Tagged dictionaries are currently unsupported. -Measuring with the 'time' utility the original zstd and my decoder both decoding the same enwik9.zst file from a ramfs, my decoder is about 3.5 times slower. Enwik9 is highly compressible, for less compressible data (like a ubuntu installation .iso) my decoder comes close to only being 1.4 times slower. +See +for clarification. # How can you use it? diff --git a/src/bin/zstd.rs b/src/bin/zstd.rs index cc9762afb..b90dcb690 100644 --- a/src/bin/zstd.rs +++ b/src/bin/zstd.rs @@ -34,9 +34,7 @@ fn decompress(flags: &[String], file_paths: &[String]) { } if flags.len() != 2 { - eprintln!( - "No flags other than -d and -c are currently implemented. Flags used: {flags:?}" - ); + eprintln!("No flags other than -d and -c are currently implemented. Flags used: {flags:?}"); return; } @@ -156,40 +154,40 @@ fn main() { let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect(); let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect(); file_paths.remove(0); - + if flags.is_empty() { - let mut encoder = FrameCompressor::new(CompressionLevel::Fastest); - encoder.set_drain(Vec::new()); - - for path in file_paths { - let start_instant = Instant::now(); - let file = std::fs::File::open(&path).unwrap(); - let input_len = file.metadata().unwrap().len() as usize; - let file = PercentPrintReader { - reader: BufReader::new(file), - total: input_len, - counter: 0, - last_percent: 0, - }; - encoder.set_source(file); - encoder.compress(); - let mut output: Vec<_> = encoder.take_drain().unwrap(); - println!( - "Compressed {path:} from {} to {} ({}%) took {}ms", - input_len, - output.len(), - if input_len == 0 { - 0 - } else { - output.len() * 100 / input_len - }, - start_instant.elapsed().as_millis() - ); - output.clear(); - encoder.set_drain(output); - } + let mut encoder = FrameCompressor::new(CompressionLevel::Fastest); + encoder.set_drain(Vec::new()); + + for path in file_paths { + let start_instant = Instant::now(); + let file = std::fs::File::open(&path).unwrap(); + let input_len = file.metadata().unwrap().len() as usize; + let file = PercentPrintReader { + reader: BufReader::new(file), + total: input_len, + counter: 0, + last_percent: 0, + }; + encoder.set_source(file); + encoder.compress(); + let mut output: Vec<_> = encoder.take_drain().unwrap(); + println!( + "Compressed {path:} from {} to {} ({}%) took {}ms", + input_len, + output.len(), + if input_len == 0 { + 0 + } else { + output.len() * 100 / input_len + }, + start_instant.elapsed().as_millis() + ); + output.clear(); + encoder.set_drain(output); + } } else { - decompress(&flags, &file_paths); + decompress(&flags, &file_paths); } } diff --git a/src/bin/zstd_dict.rs b/src/bin/zstd_dict.rs index 25f24269b..6ec26c92e 100644 --- a/src/bin/zstd_dict.rs +++ b/src/bin/zstd_dict.rs @@ -1,7 +1,7 @@ use ruzstd::dictionary::{create_raw_dict_from_dir, create_raw_dict_from_source}; +use std::env::args; use std::fs::File; use std::path::Path; -use std::env::args; fn main() { let args: Vec = args().collect(); diff --git a/src/bin/zstd_stream.rs b/src/bin/zstd_stream.rs index 521abf464..d22bac8c4 100644 --- a/src/bin/zstd_stream.rs +++ b/src/bin/zstd_stream.rs @@ -18,9 +18,7 @@ fn main() { } if flags.len() != 2 { - eprintln!( - "No flags other than -d and -c are currently implemented. Flags used: {flags:?}" - ); + eprintln!("No flags other than -d and -c are currently implemented. Flags used: {flags:?}"); return; } diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs index 9f2c94922..093b8b656 100644 --- a/src/dictionary/cover.rs +++ b/src/dictionary/cover.rs @@ -2,10 +2,10 @@ //! described in the paper "Effective Construction of Relative Lempel-Ziv Dictionaries", //! by Liao, Petri, Moffat, and Wirth, published under the University of Melbourne. //! -//! See: https://people.eng.unimelb.edu.au/ammoffat/abstracts/lpmw16www.pdf +//! See: //! //! Facebook's implementation was also used as a reference. -//! https://github.com/facebook/zstd/tree/dev/lib/dictBuilder +//! use super::DictParams; use crate::dictionary::frequency::estimate_frequency; @@ -43,11 +43,7 @@ impl PartialEq for Segment { impl PartialOrd for Segment { fn partial_cmp(&self, other: &Self) -> Option { - match self.score.partial_cmp(&other.score) { - Some(core::cmp::Ordering::Equal) => {} - ord => return ord, - } - self.score.partial_cmp(&other.score) + Some(self.cmp(other)) } } @@ -68,15 +64,15 @@ pub struct Context { /// Returns the highest scoring segment in an epoch /// as a slice of that epoch. -pub fn pick_best_segment<'epoch>( +pub fn pick_best_segment( params: &DictParams, ctx: &mut Context, - collection_sample: &'epoch [u8], + collection_sample: &'_ [u8], ) -> Segment { let mut segments = collection_sample .chunks(params.segment_size as usize) .peekable(); - let mut best_segment: &[u8] = &segments.peek().expect("at least one segment"); + let mut best_segment: &[u8] = segments.peek().expect("at least one segment"); let mut top_segment_score: usize = 0; // Iterate over segments and score each segment, keeping track of the best segment for segment in segments { @@ -107,7 +103,7 @@ fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> if ctx.frequencies.contains_key(kmer) { continue; } - let kmer_score = estimate_frequency(kmer, &collection_sample); + let kmer_score = estimate_frequency(kmer, collection_sample); ctx.frequencies.insert(*kmer, kmer_score); segment_score += kmer_score; } diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs index 28b72932d..f55eff608 100644 --- a/src/dictionary/mod.rs +++ b/src/dictionary/mod.rs @@ -50,8 +50,8 @@ pub struct DictParams { /// As found under "4. Experiments - Varying Segment Size" in the original paper, a /// segment size of 2 kiB was effective. /// - /// "We explored a range of [segment_size] values and found the performance of LMC is insensitive - /// to [segment_size]. We fix [segment_size] to 2kiB + /// "We explored a range of \[`segment_size`\] values and found the performance of LMC is insensitive + /// to \[`segment_size`\]. We fix \[`segment_size`\] to 2kiB /// /// Reasonable range: [16, 2048+] pub segment_size: u32, @@ -85,7 +85,7 @@ pub fn create_raw_dict_from_dir, W: io::Write>( for entry in dir { let entry = entry?; if entry.file_type()?.is_dir() { - recurse_read(fs::read_dir(&entry.path())?, file_paths)?; + recurse_read(fs::read_dir(entry.path())?, file_paths)?; } else { file_paths.push(entry.path()); } diff --git a/src/dictionary/reservoir.rs b/src/dictionary/reservoir.rs index 041ebc498..6fb318c91 100644 --- a/src/dictionary/reservoir.rs +++ b/src/dictionary/reservoir.rs @@ -2,7 +2,7 @@ use super::cover::K; use alloc::vec::Vec; use core::f64::consts::E; use fastrand; -use std::io; +use std::{io, vec}; /// Creates a representative sample of `input` of `size` bytes. pub fn create_sample(input: &mut R, size: usize) -> Vec { @@ -31,8 +31,7 @@ impl Reservoir { /// Initialize a new empty reservoir, creating an allocation of `size`. pub fn new(size: usize) -> Self { assert!(size >= 16, "Reservoirs cannot be below 16 bytes in size"); - let mut lake = Vec::with_capacity(size); - lake.resize(size, 0); + let lake: Vec = vec![0; size]; let k = K as u16; Self { lake, k } } diff --git a/src/lib.rs b/src/lib.rs index 49366d80d..62fd4b5a6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,6 +35,7 @@ macro_rules! vprintln { mod bit_io; mod common; pub mod decoding; +#[cfg(feature = "dict_builder")] pub mod dictionary; pub mod encoding; diff --git a/src/tests/decode_corpus.rs b/src/tests/decode_corpus.rs index 69844e45e..369b59299 100644 --- a/src/tests/decode_corpus.rs +++ b/src/tests/decode_corpus.rs @@ -7,6 +7,7 @@ fn test_decode_corpus_files() { use alloc::string::{String, ToString}; use alloc::vec::Vec; use std::fs; + use std::io::BufReader; use std::io::Read; use std::println; @@ -82,7 +83,7 @@ fn test_decode_corpus_files() { let mut original_p = p.clone(); original_p.truncate(original_p.len() - 4); - let original_f = fs::File::open(original_p).unwrap(); + let original_f = BufReader::new(fs::File::open(original_p).unwrap()); let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); println!("Results for file: {}", p.clone()); diff --git a/src/tests/dict_test.rs b/src/tests/dict_test.rs index 516b1782e..60e4a90d7 100644 --- a/src/tests/dict_test.rs +++ b/src/tests/dict_test.rs @@ -83,6 +83,7 @@ fn test_dict_decoding() { use alloc::string::{String, ToString}; use alloc::vec::Vec; use std::fs; + use std::io::BufReader; use std::io::Read; use std::println; @@ -97,7 +98,7 @@ fn test_dict_decoding() { let mut speeds_read = Vec::new(); let mut files: Vec<_> = fs::read_dir("./dict_tests/files").unwrap().collect(); - let dict = fs::File::open("./dict_tests/dictionary").unwrap(); + let dict = BufReader::new(fs::File::open("./dict_tests/dictionary").unwrap()); let dict: Vec = dict.bytes().map(|x| x.unwrap()).collect(); files.sort_by_key(|x| match x { @@ -155,7 +156,7 @@ fn test_dict_decoding() { let mut original_p = p.clone(); original_p.truncate(original_p.len() - 4); - let original_f = fs::File::open(original_p).unwrap(); + let original_f = BufReader::new(fs::File::open(original_p).unwrap()); let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); println!("Results for file: {}", p.clone()); diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 52fda6ddd..3a47122a5 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -130,8 +130,9 @@ fn test_frame_decoder() { fn test_decode_from_to() { use crate::decoding::FrameDecoder; use std::fs::File; + use std::io::BufReader; use std::io::Read; - let f = File::open("./decodecorpus_files/z000088.zst").unwrap(); + let f = BufReader::new(File::open("./decodecorpus_files/z000088.zst").unwrap()); let mut frame_dec = FrameDecoder::new(); let content: Vec = f.bytes().map(|x| x.unwrap()).collect(); @@ -197,7 +198,7 @@ fn test_decode_from_to() { None => std::println!("No checksums to test\n"), } - let original_f = File::open("./decodecorpus_files/z000088").unwrap(); + let original_f = BufReader::new(File::open("./decodecorpus_files/z000088").unwrap()); let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); if original.len() != result.len() { @@ -233,6 +234,7 @@ fn test_specific_file() { use crate::decoding::BlockDecodingStrategy; use crate::decoding::FrameDecoder; use std::fs; + use std::io::BufReader; use std::io::Read; let path = "./decodecorpus_files/z000068.zst"; @@ -256,7 +258,7 @@ fn test_specific_file() { .unwrap(); let result = frame_dec.collect().unwrap(); - let original_f = fs::File::open("./decodecorpus_files/z000088").unwrap(); + let original_f = BufReader::new(fs::File::open("./decodecorpus_files/z000088").unwrap()); let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); std::println!("Results for file: {}", path); @@ -293,6 +295,7 @@ fn test_specific_file() { #[cfg(feature = "std")] fn test_streaming() { use std::fs; + use std::io::BufReader; use std::io::Read; let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap(); @@ -301,7 +304,7 @@ fn test_streaming() { let mut result = Vec::new(); Read::read_to_end(&mut stream, &mut result).unwrap(); - let original_f = fs::File::open("./decodecorpus_files/z000088").unwrap(); + let original_f = BufReader::new(fs::File::open("./decodecorpus_files/z000088").unwrap()); let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); if original.len() != result.len() { @@ -343,7 +346,7 @@ fn test_streaming() { let mut result = Vec::new(); Read::read_to_end(&mut stream, &mut result).unwrap(); - let original_f = fs::File::open("./decodecorpus_files/z000068").unwrap(); + let original_f = BufReader::new(fs::File::open("./decodecorpus_files/z000068").unwrap()); let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); std::println!("Results for file:"); From ff3d5a7f6943488a003e3e64e61de59d93d60766 Mon Sep 17 00:00:00 2001 From: arc Date: Tue, 19 Aug 2025 16:02:37 -0600 Subject: [PATCH 14/16] docs: include some rustdoc metadata --- Cargo.toml | 4 ++++ src/dictionary/mod.rs | 2 +- src/lib.rs | 5 +++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7fabc8906..64ec9f513 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,10 @@ readme = "Readme.md" keywords = ["zstd", "zstandard", "decompression"] categories = ["compression"] +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + [dependencies] twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"], optional = true } diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs index f55eff608..48fdbcb99 100644 --- a/src/dictionary/mod.rs +++ b/src/dictionary/mod.rs @@ -44,7 +44,7 @@ use std::{ /// /// Changing these values can improve the resulting dictionary size for certain datasets. // TODO: move `k` here. -pub struct DictParams { +pub(super) struct DictParams { /// Segment size. /// /// As found under "4. Experiments - Varying Segment Size" in the original paper, a diff --git a/src/lib.rs b/src/lib.rs index 62fd4b5a6..7f3106eca 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,8 +7,8 @@ //! //! ## Compression //! The [encoding] module contains the code for compression. -//! Decompression can be achieved by using the [`encoding::compress`]/[`encoding::compress_to_vec`] -//! functions or the [`encoding::FrameCompressor`] +//! Compression can be achieved by using the [`encoding::compress`]/[`encoding::compress_to_vec`] +//! functions or [`encoding::FrameCompressor`] //! #![doc = include_str!("../Readme.md")] #![no_std] @@ -36,6 +36,7 @@ mod bit_io; mod common; pub mod decoding; #[cfg(feature = "dict_builder")] +#[cfg_attr(docsrs, doc(cfg(feature = "dict_builder")))] pub mod dictionary; pub mod encoding; From 38c7c8c89261944d34395977fd46c03ae118b5a6 Mon Sep 17 00:00:00 2001 From: arc Date: Tue, 19 Aug 2025 16:15:43 -0600 Subject: [PATCH 15/16] lint: fixing clippy --- Readme.md | 2 +- src/bit_io/bit_reader.rs | 2 +- src/bit_io/bit_writer.rs | 10 +++++----- src/dictionary/mod.rs | 5 +++-- src/fse/fse_decoder.rs | 2 +- src/huff0/huff0_decoder.rs | 2 +- 6 files changed, 12 insertions(+), 11 deletions(-) diff --git a/Readme.md b/Readme.md index c96e787b4..79d2ff943 100644 --- a/Readme.md +++ b/Readme.md @@ -46,7 +46,7 @@ provides the ability to create new dictionaries. On the `github-users` sample set, our implementation benchmarks within 0.2% of the official implementation (as of commit `09e52d07340acdb2e13817b066e8be6e424f7258`): -``` +```no_build uncompressed: 100.00% (7484607 bytes) no dict: 34.99% of original size (2618872 bytes) reference dict: 16.16% of no dict size (2195672 bytes smaller) diff --git a/src/bit_io/bit_reader.rs b/src/bit_io/bit_reader.rs index c8987250e..2140ddb3b 100644 --- a/src/bit_io/bit_reader.rs +++ b/src/bit_io/bit_reader.rs @@ -66,7 +66,7 @@ impl<'s> BitReader<'s> { let mut bit_shift = bits_left_in_current_byte; //this many bits are already set in value - assert!(self.idx % 8 == 0); + assert!(self.idx.is_multiple_of(8)); //collect full bytes for _ in 0..full_bytes_needed { diff --git a/src/bit_io/bit_writer.rs b/src/bit_io/bit_writer.rs index fb809926c..7ce228a54 100644 --- a/src/bit_io/bit_writer.rs +++ b/src/bit_io/bit_writer.rs @@ -45,7 +45,7 @@ impl>> BitWriter { /// Reset to an index. Currently only supports resetting to a byte aligned index pub fn reset_to(&mut self, index: usize) { - assert!(index % 8 == 0); + assert!(index.is_multiple_of(8)); self.partial = 0; self.bits_in_partial = 0; self.bit_idx = index; @@ -66,7 +66,7 @@ impl>> BitWriter { // We might be changing bits unaligned to byte borders. // This means the lower bits of the first byte we are touching must stay the same - if idx % 8 != 0 { + if !idx.is_multiple_of(8) { // How many (upper) bits will change in the first byte? let bits_in_first_byte = 8 - (idx % 8); // We don't support only changing a few bits in the middle of a byte @@ -82,7 +82,7 @@ impl>> BitWriter { idx += bits_in_first_byte; } - assert!(idx % 8 == 0); + assert!(idx.is_multiple_of(8)); // We are now byte aligned, change idx to byte resolution let mut idx = idx / 8; @@ -113,7 +113,7 @@ impl>> BitWriter { /// Flush temporary internal buffers to the output buffer. Only works if this is currently byte aligned pub fn flush(&mut self) { - assert!(self.bits_in_partial % 8 == 0); + assert!(self.bits_in_partial.is_multiple_of(8)); let full_bytes = self.bits_in_partial / 8; self.output .as_mut() @@ -204,7 +204,7 @@ impl>> BitWriter { /// Returns how many bits are missing for an even byte pub fn misaligned(&self) -> usize { let idx = self.index(); - if idx % 8 == 0 { + if idx.is_multiple_of(8) { 0 } else { 8 - (idx % 8) diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs index 48fdbcb99..322f68c90 100644 --- a/src/dictionary/mod.rs +++ b/src/dictionary/mod.rs @@ -68,10 +68,11 @@ pub(super) struct DictParams { /// /// # Examples /// ```no_run +/// use std::fs::File; /// // Create a roughly 1mb dictionary, training off of file in `sample_files` /// let input_folder = "sample_files/"; -/// let output = File::create("output.dict"); -/// ruzstd::dict::create_dict_from_dir(input_folder, &mut output, 1_000_000); +/// let mut output = File::create("output.dict").unwrap(); +/// ruzstd::dictionary::create_raw_dict_from_dir(input_folder, &mut output, 1_000_000); /// ``` pub fn create_raw_dict_from_dir, W: io::Write>( path: P, diff --git a/src/fse/fse_decoder.rs b/src/fse/fse_decoder.rs index bf573c1b0..7cd59dc6d 100644 --- a/src/fse/fse_decoder.rs +++ b/src/fse/fse_decoder.rs @@ -297,7 +297,7 @@ impl FSETable { }); } - let bytes_read = if br.bits_read() % 8 == 0 { + let bytes_read = if br.bits_read().is_multiple_of(8) { br.bits_read() / 8 } else { (br.bits_read() / 8) + 1 diff --git a/src/huff0/huff0_decoder.rs b/src/huff0/huff0_decoder.rs index 5c3e98bf0..1952aea3c 100644 --- a/src/huff0/huff0_decoder.rs +++ b/src/huff0/huff0_decoder.rs @@ -245,7 +245,7 @@ impl HuffmanTable { let num_weights = header - 127; self.weights.resize(num_weights as usize, 0); - let bytes_needed = if num_weights % 2 == 0 { + let bytes_needed = if num_weights.is_multiple_of(2) { num_weights as usize / 2 } else { (num_weights as usize / 2) + 1 From a598241c977cf3bd35edebe05763f2bbda5195dc Mon Sep 17 00:00:00 2001 From: arc Date: Thu, 21 Aug 2025 06:43:38 -0600 Subject: [PATCH 16/16] pr(cleanup): apply feedback from pull/91 - Fix typo in cargo.toml - set VERBOSE to false and add a test to verify it's false - remove commented out bench code from zstd_dict.rs --- Cargo.toml | 2 +- src/bin/zstd_dict.rs | 116 ------------------------------------------- src/lib.rs | 6 +-- src/tests/mod.rs | 7 +++ 4 files changed, 11 insertions(+), 120 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 64ec9f513..5a7cec4a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"], optional = true } -# Internal feature, only used when building as part of libstd, not part of theea +# Internal feature, only used when building as part of libstd, not part of the # stable interface of this crate. compiler_builtins = { version = "0.1.2", optional = true } core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" } diff --git a/src/bin/zstd_dict.rs b/src/bin/zstd_dict.rs index 6ec26c92e..54a4d2651 100644 --- a/src/bin/zstd_dict.rs +++ b/src/bin/zstd_dict.rs @@ -22,119 +22,3 @@ fn main() { create_raw_dict_from_dir(input_path, &mut output, dict_size).unwrap(); } } - -//struct BenchmarkResults { -// pub uncompressed_size: usize, -// pub nodict_size: usize, -// pub reference_size: usize, -// pub our_size: usize, -//} -// -//impl Display for BenchmarkResults { -// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { -// writeln!(f, "uncompressed: 100.00% ({})", self.uncompressed_size)?; -// writeln!( -// f, -// "no dict: {:.2}% of original size ({})", -// f64::from(self.nodict_size as u32) / f64::from(self.uncompressed_size as u32) * 100.0, -// self.nodict_size -// )?; -// writeln!( -// f, -// "reference dict: {:.2}% of no dict size ({} bytes smaller)", -// f64::from(self.reference_size as u32) / f64::from(self.nodict_size as u32) * 100.0, -// self.nodict_size - self.reference_size -// )?; -// write!( -// f, -// "our dict: {:.2}% of no dict size ({} bytes smaller)", -// f64::from(self.our_size as u32) / f64::from(self.nodict_size as u32) * 100.0, -// self.nodict_size - self.our_size -// )?; -// Ok(()) -// } -//} -// -//struct Dumpster(pub usize); -// -//impl Write for Dumpster { -// fn write(&mut self, buf: &[u8]) -> io::Result { -// self.0 += buf.len(); -// Ok(buf.len()) -// } -// -// fn flush(&mut self) -> io::Result<()> { -// Ok(()) -// } -//} -// -///// Compares compression ratios achieved with a dictionary -//#[allow(unused)] -//fn bench>(input_path: P) -> BenchmarkResults { -// // At what compression level the dicts are built with -// let compression_level = 1; -// // 1. Collect a list of a path to every file in the directory into `file_paths` -// println!("[bench]: collecting list of input files"); -// let mut file_paths: Vec = Vec::new(); -// let dir: fs::ReadDir = fs::read_dir(&input_path).expect("read input path"); -// fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec) -> Result<(), io::Error> { -// for entry in dir { -// let entry = entry?; -// if entry.file_type()?.is_dir() { -// recurse_read(fs::read_dir(&entry.path())?, file_paths)?; -// } else { -// file_paths.push(entry.path()); -// } -// } -// Ok(()) -// } -// recurse_read(dir, &mut file_paths).expect("recursing over input dir"); -// -// // 2. Create two dictionaries, one with our strategy, and one with theirs -// println!("[bench]: creating reference dict"); -// let reference_dict = -// zstd::dict::from_files(file_paths.iter(), 112640).expect("create reference dict"); -// let mut our_dict = Vec::with_capacity(112640); -// println!("[bench]: creating our dict"); -// create_dict_from_dir(input_path, &mut our_dict, 112640).expect("create our dict"); -// // Open each file and compress it -// let mut uncompressed_size: usize = 0; -// let mut nodict_size: usize = 0; -// -// let mut reference_output = Dumpster(0); -// let mut reference_encoder = -// zstd::Encoder::with_dictionary(&mut reference_output, compression_level, &reference_dict) -// .unwrap(); -// reference_encoder.multithread(8).unwrap(); -// let mut our_output = Dumpster(0); -// let mut our_encoder = -// zstd::Encoder::with_dictionary(&mut our_output, compression_level, &our_dict).unwrap(); -// our_encoder.multithread(8).unwrap(); -// for (idx, path) in file_paths.iter().enumerate() { -// if idx % 10 == 0 { -// println!("[bench]: compressing file {}/{}", idx + 1, file_paths.len()); -// } -// let mut handle = File::open(path).unwrap(); -// let mut data = Vec::new(); -// handle.read_to_end(&mut data).unwrap(); -// uncompressed_size += data.len(); -// // Compress with no dict -// let nodict_output = zstd::encode_all(data.as_slice(), compression_level).unwrap(); -// nodict_size += nodict_output.len(); -// // Compress with the reference dict -// reference_encoder -// .write_all(data.as_slice()) -// .expect("reference writer writing"); -// // Compress with our dict -// our_encoder -// .write_all(data.as_slice()) -// .expect("our writer writing"); -// } -// -// BenchmarkResults { -// uncompressed_size, -// nodict_size, -// reference_size: reference_output.0, -// our_size: our_output.0, -// } -//} diff --git a/src/lib.rs b/src/lib.rs index 7f3106eca..0f85407a4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,7 +21,7 @@ extern crate std; extern crate alloc; #[cfg(feature = "std")] -pub(crate) const VERBOSE: bool = true; +pub(crate) const VERBOSE: bool = false; macro_rules! vprintln { ($($x:expr),*) => { @@ -52,8 +52,6 @@ pub(crate) mod fse; #[cfg(not(feature = "fuzz_exports"))] pub(crate) mod huff0; -mod tests; - #[cfg(feature = "std")] pub mod io_std; @@ -65,3 +63,5 @@ pub mod io_nostd; #[cfg(not(feature = "std"))] pub use io_nostd as io; + +mod tests; diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 15580db3c..13090296e 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -579,3 +579,10 @@ pub mod dict_test; #[cfg(feature = "std")] pub mod encode_corpus; pub mod fuzz_regressions; + +#[cfg(feature = "std")] +#[test] +fn verbose_disabled() { + use crate::VERBOSE; + assert_eq!(VERBOSE, false); +}