diff --git a/.gitignore b/.gitignore index 118714d18..6cb4d5724 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ **/*.rs.bk Cargo.lock /local_corpus_files +/local_dict_corpus_files /orig-zstd fuzz_decodecorpus perf.data* diff --git a/Cargo.toml b/Cargo.toml index 884b83591..5a7cec4a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,10 @@ readme = "Readme.md" keywords = ["zstd", "zstandard", "decompression"] categories = ["compression"] +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + [dependencies] twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"], optional = true } @@ -20,17 +24,20 @@ twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"] compiler_builtins = { version = "0.1.2", optional = true } core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" } alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" } +fastrand = "2.3.0" + [dev-dependencies] criterion = "0.5" rand = { version = "0.8.5", features = ["small_rng"] } -zstd = "0.13.2" +zstd = { version = "0.13.2", features = ["zstdmt"]} [features] default = ["hash", "std"] hash = ["dep:twox-hash"] fuzz_exports = [] std = [] +dict_builder = ["std"] # Internal feature, only used when building as part of libstd, not part of the # stable interface of this crate. @@ -47,3 +54,7 @@ required-features = ["std"] [[bin]] name = "zstd_stream" required-features = ["std"] + +[[bin]] +name = "zstd_dict" +required-features = ["std", "dict_builder"] diff --git a/Readme.md b/Readme.md index 3281a3d02..79d2ff943 100644 --- a/Readme.md +++ b/Readme.md @@ -15,8 +15,20 @@ This crate is currently actively maintained. # Current Status -Feature complete on the decoder side. +## Decompression +The `decoding` module provides a complete +implementation of a Zstandard decompressor. + +In terms of speed, `ruzstd` is behind the original C implementation +which has a rust binding located [here](https://github.com/gyscos/zstd-rs). + +Measuring with the 'time' utility the original zstd and my decoder both +decoding the same enwik9.zst file from a ramfs, my decoder is about 3.5 +times slower. Enwik9 is highly compressible, for less compressible data +(like a ubuntu installation .iso) my decoder comes close to only being +1.4 times slower. +## Compression On the compression side: - Support for generating compressed blocks at any compression level - [x] Uncompressed @@ -24,13 +36,28 @@ On the compression side: - [ ] Default (roughly level 3) - [ ] Better (roughly level 7) - [ ] Best (roughly level 11) -- [ ] Checksums +- [x] Checksums - [ ] Dictionaries -## Speed -In terms of speed this library is behind the original C implementation which has a rust binding located [here](https://github.com/gyscos/zstd-rs). +## Dictionary Generation +When the `dict_builder` feature is enabled, the `dictionary` module +provides the ability to create new dictionaries. + +On the `github-users` sample set, our implementation benchmarks within +0.2% of the official implementation (as of commit +`09e52d07340acdb2e13817b066e8be6e424f7258`): +```no_build +uncompressed: 100.00% (7484607 bytes) +no dict: 34.99% of original size (2618872 bytes) +reference dict: 16.16% of no dict size (2195672 bytes smaller) +our dict: 16.28% of no dict size (2192400 bytes smaller) +``` + +The dictionary generator only provides support for creating "raw +content" dictionaries. Tagged dictionaries are currently unsupported. -Measuring with the 'time' utility the original zstd and my decoder both decoding the same enwik9.zst file from a ramfs, my decoder is about 3.5 times slower. Enwik9 is highly compressible, for less compressible data (like a ubuntu installation .iso) my decoder comes close to only being 1.4 times slower. +See +for clarification. # How can you use it? diff --git a/src/bin/zstd.rs b/src/bin/zstd.rs index 4ec9db77e..4c629b7cf 100644 --- a/src/bin/zstd.rs +++ b/src/bin/zstd.rs @@ -21,7 +21,7 @@ struct StateTracker { file_size: u64, old_percentage: i8, } - +#[allow(unused)] fn decompress(flags: &[String], file_paths: &[String]) { if !flags.contains(&"-d".to_owned()) { eprintln!("This zstd implementation only supports decompression. Please add a \"-d\" flag"); @@ -128,6 +128,7 @@ fn decompress(flags: &[String], file_paths: &[String]) { } } +#[allow(unused)] struct PercentPrintReader { total: usize, counter: usize, diff --git a/src/bin/zstd_dict.rs b/src/bin/zstd_dict.rs new file mode 100644 index 000000000..54a4d2651 --- /dev/null +++ b/src/bin/zstd_dict.rs @@ -0,0 +1,24 @@ +use ruzstd::dictionary::{create_raw_dict_from_dir, create_raw_dict_from_source}; +use std::env::args; +use std::fs::File; +use std::path::Path; + +fn main() { + let args: Vec = args().collect(); + let input_path: &Path = args.get(1).expect("no input provided").as_ref(); + let output_path: &Path = args.get(2).expect("no output path provided").as_ref(); + let dict_size = args + .get(3) + .expect("no dict size provided (kb)") + .parse::() + .expect("dict size was not a valid num"); + + let mut output = File::create(output_path).unwrap(); + if input_path.is_file() { + let source = File::open(input_path).expect("unable to open input path"); + let source_size = source.metadata().unwrap().len(); + create_raw_dict_from_source(source, source_size as usize, &mut output, dict_size); + } else { + create_raw_dict_from_dir(input_path, &mut output, dict_size).unwrap(); + } +} diff --git a/src/bit_io/bit_reader.rs b/src/bit_io/bit_reader.rs index 0d3e807aa..2140ddb3b 100644 --- a/src/bit_io/bit_reader.rs +++ b/src/bit_io/bit_reader.rs @@ -66,7 +66,7 @@ impl<'s> BitReader<'s> { let mut bit_shift = bits_left_in_current_byte; //this many bits are already set in value - assert!(self.idx % 8 == 0); + assert!(self.idx.is_multiple_of(8)); //collect full bytes for _ in 0..full_bytes_needed { @@ -116,7 +116,7 @@ impl core::fmt::Display for GetBitsError { } => { write!( f, - "Cant serve this request. The reader is limited to {limit} bits, requested {num_requested_bits} bits", + "Cant serve this request. The reader is limited to {limit} bits, requested {num_requested_bits} bits" ) } GetBitsError::NotEnoughRemainingBits { @@ -125,7 +125,7 @@ impl core::fmt::Display for GetBitsError { } => { write!( f, - "Can\'t read {requested} bits, only have {remaining} bits left", + "Can\'t read {requested} bits, only have {remaining} bits left" ) } } diff --git a/src/bit_io/bit_writer.rs b/src/bit_io/bit_writer.rs index fb809926c..7ce228a54 100644 --- a/src/bit_io/bit_writer.rs +++ b/src/bit_io/bit_writer.rs @@ -45,7 +45,7 @@ impl>> BitWriter { /// Reset to an index. Currently only supports resetting to a byte aligned index pub fn reset_to(&mut self, index: usize) { - assert!(index % 8 == 0); + assert!(index.is_multiple_of(8)); self.partial = 0; self.bits_in_partial = 0; self.bit_idx = index; @@ -66,7 +66,7 @@ impl>> BitWriter { // We might be changing bits unaligned to byte borders. // This means the lower bits of the first byte we are touching must stay the same - if idx % 8 != 0 { + if !idx.is_multiple_of(8) { // How many (upper) bits will change in the first byte? let bits_in_first_byte = 8 - (idx % 8); // We don't support only changing a few bits in the middle of a byte @@ -82,7 +82,7 @@ impl>> BitWriter { idx += bits_in_first_byte; } - assert!(idx % 8 == 0); + assert!(idx.is_multiple_of(8)); // We are now byte aligned, change idx to byte resolution let mut idx = idx / 8; @@ -113,7 +113,7 @@ impl>> BitWriter { /// Flush temporary internal buffers to the output buffer. Only works if this is currently byte aligned pub fn flush(&mut self) { - assert!(self.bits_in_partial % 8 == 0); + assert!(self.bits_in_partial.is_multiple_of(8)); let full_bytes = self.bits_in_partial / 8; self.output .as_mut() @@ -204,7 +204,7 @@ impl>> BitWriter { /// Returns how many bits are missing for an even byte pub fn misaligned(&self) -> usize { let idx = self.index(); - if idx % 8 == 0 { + if idx.is_multiple_of(8) { 0 } else { 8 - (idx % 8) diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs new file mode 100644 index 000000000..093b8b656 --- /dev/null +++ b/src/dictionary/cover.rs @@ -0,0 +1,134 @@ +//! An implementation of the local maximum coverage algorithm +//! described in the paper "Effective Construction of Relative Lempel-Ziv Dictionaries", +//! by Liao, Petri, Moffat, and Wirth, published under the University of Melbourne. +//! +//! See: +//! +//! Facebook's implementation was also used as a reference. +//! + +use super::DictParams; +use crate::dictionary::frequency::estimate_frequency; +use core::convert::TryInto; +use std::collections::HashMap; +use std::vec::Vec; + +/// The size of each k-mer +pub(super) const K: usize = 16; + +///As found under "4: Experiments - Varying k-mer Size" in the original paper, +/// "when k = 16, across all our text collections, there is a reasonable spread" +/// +/// Reasonable range: [6, 16] +pub(super) type KMer = [u8; K]; + +pub struct Segment { + /// The actual contents of the segment. + pub raw: Vec, + /// A measure of how "ideal" a given segment would be to include in the dictionary + /// + /// Higher is better, there's no upper limit. This number is determined by + /// estimating the number of occurances in a given epoch + pub score: usize, +} + +impl Eq for Segment {} + +impl PartialEq for Segment { + fn eq(&self, other: &Self) -> bool { + // We only really care about score in regards to heap order + self.score == other.score + } +} + +impl PartialOrd for Segment { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Segment { + fn cmp(&self, other: &Self) -> core::cmp::Ordering { + self.score.cmp(&other.score) + } +} + +/// A re-usable allocation containing large allocations +/// that are used multiple times during dictionary construction (once per epoch) +pub struct Context { + /// Keeps track of the number of occurances of a particular k-mer within an epoch. + /// + /// Reset for each epoch. + pub frequencies: HashMap, +} + +/// Returns the highest scoring segment in an epoch +/// as a slice of that epoch. +pub fn pick_best_segment( + params: &DictParams, + ctx: &mut Context, + collection_sample: &'_ [u8], +) -> Segment { + let mut segments = collection_sample + .chunks(params.segment_size as usize) + .peekable(); + let mut best_segment: &[u8] = segments.peek().expect("at least one segment"); + let mut top_segment_score: usize = 0; + // Iterate over segments and score each segment, keeping track of the best segment + for segment in segments { + let segment_score = score_segment(ctx, collection_sample, segment); + if segment_score > top_segment_score { + best_segment = segment; + top_segment_score = segment_score; + } + } + + Segment { + raw: best_segment.into(), + score: top_segment_score, + } +} + +/// Given a segment, compute the score (or usefulness) of that segment against the entire epoch. +/// +/// `score_segment` modifies `ctx.frequencies`. +fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> usize { + let mut segment_score = 0; + // Determine the score of each overlapping k-mer + for i in 0..(segment.len() - K - 1) { + let kmer: &KMer = (&segment[i..i + K]) + .try_into() + .expect("Failed to make kmer"); + // if the kmer is already in the pool, it recieves a score of zero + if ctx.frequencies.contains_key(kmer) { + continue; + } + let kmer_score = estimate_frequency(kmer, collection_sample); + ctx.frequencies.insert(*kmer, kmer_score); + segment_score += kmer_score; + } + + segment_score +} + +/// Computes the number of epochs and the size of each epoch. +/// +/// Returns a (number of epochs, epoch size) tuple. +/// +/// A translation of `COVER_epoch_info_t COVER_computeEpochs()` from facebook/zstd. +pub fn compute_epoch_info( + params: &DictParams, + max_dict_size: usize, + num_kmers: usize, +) -> (usize, usize) { + let min_epoch_size = 10_000; // 10 KiB + let mut num_epochs: usize = usize::max(1, max_dict_size / params.segment_size as usize); + let mut epoch_size: usize = num_kmers / num_epochs; + if epoch_size >= min_epoch_size { + assert!(epoch_size * num_epochs <= num_kmers); + return (num_epochs, epoch_size); + } + epoch_size = usize::min(min_epoch_size, num_kmers); + num_epochs = num_kmers / epoch_size; + (num_epochs, epoch_size) +} diff --git a/src/dictionary/frequency.rs b/src/dictionary/frequency.rs new file mode 100644 index 000000000..074e73839 --- /dev/null +++ b/src/dictionary/frequency.rs @@ -0,0 +1,71 @@ +//! Contains `compute_frequency`, a function +//! that uses a rolling Karp-Rabin hash to +//! efficiently count the number of occurences +//! of a given k-mer within a set. + +/// Computes a best effort guess as to how many times `pattern` occurs within +/// `body`. While not 100% accurate, it will be accurate the vast majority of time +pub fn estimate_frequency(pattern: &[u8], body: &[u8]) -> usize { + assert!(body.len() >= pattern.len()); + // A prime number for modulo operations to reduce collisions (q) + const PRIME: isize = 2654435761; + // Number of characters in the input alphabet (d) + const ALPHABET_SIZE: isize = 256; + // Hash of input pattern (p) + let mut pattern_hash: isize = 0; + // Hash of the current window of text (t) + let mut window_hash: isize = 0; + // High-order digit multiplier (h) + let mut h: isize = 1; + + // Precompute h (?) + h = (h * ALPHABET_SIZE) % PRIME; + + // Compute initial hash values + for i in 0..pattern.len() { + pattern_hash = (ALPHABET_SIZE * pattern_hash + pattern[i] as isize) % PRIME; + window_hash = (ALPHABET_SIZE * window_hash + body[i] as isize) % PRIME; + } + + let mut num_occurances = 0; + for i in 0..=body.len() - pattern.len() { + // There's *probably* a match if these two match + if pattern_hash == window_hash { + num_occurances += 1; + } + + // Compute hash values for next window + if i < body.len() - pattern.len() { + window_hash = (ALPHABET_SIZE * (window_hash - body[i] as isize * h) + + body[i + pattern.len()] as isize) + % PRIME; + } + } + + num_occurances +} + +#[cfg(test)] +mod tests { + use super::estimate_frequency; + #[test] + fn dead_beef() { + assert_eq!( + estimate_frequency(&[0xde, 0xad], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]), + 2 + ); + } + + #[test] + fn smallest_body() { + assert_eq!(estimate_frequency(&[0x00, 0xff], &[0x00, 0xff]), 1); + } + + #[test] + fn no_match() { + assert_eq!( + estimate_frequency(&[0xff, 0xff], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]), + 0 + ); + } +} diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs new file mode 100644 index 000000000..322f68c90 --- /dev/null +++ b/src/dictionary/mod.rs @@ -0,0 +1,189 @@ +//! Code for creating a separate content dictionary. +//! +//! Effective dictionaries are up to 1% the size of the complete training body, +//! and are trained on many examples of the original data. +//! +//! Implemented following the paper "Effective construction of +//! Relative Lempel-Ziv Dictionaries", by Kewen Liao, Matthias Petri, +//! Alistair Moffat, and Anthony Wirth + +// The algorithm is summarized here +// 1. The text is split into "epochs", or chunks from the original source +// 2. From within each epoch, we select the "segment", or 1 KiB contiguous section +// that's predicted to be the best option to include in the dictionary. Concatenated, +// these segments form the dictionary. +// +// This segment scoring algorithm operates as follows: +// For a given epoch: +// - Run a reservoir sampler over the entire epoch, creating a +// reservoir of n/t, where `t` is the desired number of occurances +// we want the most common k-mers to have +// - Have the ability to estimate +// the frequency of a given k-mer: `f(w: k-mer)` calculates +// the frequency of w in the reservoir using a rolling karp-rabin hash +// - The score of a segment is the sum of `f(w)` called on every kmer within the segment +mod cover; +mod frequency; +mod reservoir; + +use crate::dictionary::reservoir::create_sample; +use alloc::vec; +use core::cmp::Reverse; +use cover::*; +use std::{ + boxed::Box, + collections::{BinaryHeap, HashMap}, + dbg, + fs::{self, File}, + io::{self, BufReader, Read}, + path::{Path, PathBuf}, + vec::Vec, +}; + +/// A set of values that are used during dictionary construction. +/// +/// Changing these values can improve the resulting dictionary size for certain datasets. +// TODO: move `k` here. +pub(super) struct DictParams { + /// Segment size. + /// + /// As found under "4. Experiments - Varying Segment Size" in the original paper, a + /// segment size of 2 kiB was effective. + /// + /// "We explored a range of \[`segment_size`\] values and found the performance of LMC is insensitive + /// to \[`segment_size`\]. We fix \[`segment_size`\] to 2kiB + /// + /// Reasonable range: [16, 2048+] + pub segment_size: u32, +} + +/// Creates a "raw content" dictionary, training off of every file in this directory and all +/// sub-directories. +/// +/// The resulting dictionary will be approxamitely `dict_size` or less, and written to `output`. +/// +/// # Errors +/// This function returns `Ok(())` if the dictionary was created successfully, and an +/// `Err(io::Error)` if an error was encountered reading the input directory. +/// +/// # Examples +/// ```no_run +/// use std::fs::File; +/// // Create a roughly 1mb dictionary, training off of file in `sample_files` +/// let input_folder = "sample_files/"; +/// let mut output = File::create("output.dict").unwrap(); +/// ruzstd::dictionary::create_raw_dict_from_dir(input_folder, &mut output, 1_000_000); +/// ``` +pub fn create_raw_dict_from_dir, W: io::Write>( + path: P, + output: &mut W, + dict_size: usize, +) -> Result<(), io::Error> { + // Collect a list of a path to every file in the directory into `file_paths` + let mut file_paths: Vec = Vec::new(); + let dir: fs::ReadDir = fs::read_dir(path)?; + fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec) -> Result<(), io::Error> { + for entry in dir { + let entry = entry?; + if entry.file_type()?.is_dir() { + recurse_read(fs::read_dir(entry.path())?, file_paths)?; + } else { + file_paths.push(entry.path()); + } + } + Ok(()) + } + recurse_read(dir, &mut file_paths)?; + + // Open each file and chain the readers together + let mut total_file_len: u64 = 0; + let mut file_handles: Vec = Vec::new(); + for path in file_paths { + let handle = File::open(path)?; + total_file_len += handle.metadata()?.len(); + file_handles.push(handle); + } + let empty_reader: Box = Box::new(io::empty()); + let chained_files = file_handles + .iter() + .fold(empty_reader, |acc, reader| Box::new(acc.chain(reader))); + + // Create a dict using the new reader + create_raw_dict_from_source(chained_files, total_file_len as usize, output, dict_size); + Ok(()) +} + +/// Read from `source` to create a "raw content" dictionary of `dict_size`. +/// The completed dictionary is written to `output`. +/// +/// - `source` will be used as training data for the entire dictionary. +/// - `source_size` influences how the data is divided and sampled and is measured +/// in bytes. While this does not need to be exact, estimates should attempt to be +/// larger than the actual collection size. +/// - `output` is where the completed dictionary will be written. +/// - `dict_size` determines how large the complete dictionary should be. The completed +/// dictionary will be this size or smaller. +/// +/// This function uses `BufRead` internally, the provided reader need not be buffered. +pub fn create_raw_dict_from_source( + source: R, + source_size: usize, + output: &mut W, + dict_size: usize, +) { + vprintln!("create_dict: creating {dict_size} byte dict from {source_size} byte source"); + let mut buffered_source = BufReader::with_capacity(128_000, source); + + let params = DictParams { segment_size: 2048 }; + let num_segments = source_size / params.segment_size as usize; + // According to 4. Experiments - Varying Reservoir Sampler Thresholds, + // setting reservoir size to collection size / min{collection size / (2 * number of segments), + // 256} was effective + let sample_size = source_size / usize::min(source_size / (2 * num_segments), 256); + vprintln!("create_dict: creating {sample_size} byte sample of collection"); + let collection_sample = create_sample(&mut buffered_source, sample_size); + + // A collection of segments to be used in the final dictionary. + // + // Contains the best segment from every epoch. + // Reverse is used because we want a min heap, where + // the lowest scoring items come first + let mut pool: BinaryHeap> = BinaryHeap::new(); + let (_, epoch_size) = compute_epoch_info(¶ms, dict_size, source_size / K); + let num_epochs = source_size / epoch_size; + vprintln!("create_dict: computed epoch info, using {num_epochs} epochs of {epoch_size} bytes"); + //let mut current_epoch = vec![0; epoch_size]; + let mut current_epoch = vec![0; 100]; + let mut epoch_counter = 0; + let mut ctx = Context { + frequencies: HashMap::with_capacity(epoch_size / K), + }; + // Score each segment in the epoch and select the highest scoring segment + // for the pool + while dbg!(buffered_source + .read(&mut current_epoch) + .expect("can read input")) + != 0 + { + epoch_counter += 1; + let best_segment = pick_best_segment(¶ms, &mut ctx, &collection_sample); + vprintln!( + "\tcreate_dict: epoch {epoch_counter}/{num_epochs} has best segment score {}", + best_segment.score + ); + pool.push(Reverse(best_segment)); + // Wipe frequency list for next epoch + ctx.frequencies.clear(); + } + vprintln!( + "create_dict: {epoch_counter} epochs written, writing {} segments", + pool.len() + ); + // Write the dictionary with the highest scoring segment last because + // closer items can be represented with a smaller offset + while let Some(segment) = pool.pop() { + output + .write_all(&segment.0.raw) + .expect("can write to output"); + } +} diff --git a/src/dictionary/reservoir.rs b/src/dictionary/reservoir.rs new file mode 100644 index 000000000..6fb318c91 --- /dev/null +++ b/src/dictionary/reservoir.rs @@ -0,0 +1,144 @@ +use super::cover::K; +use alloc::vec::Vec; +use core::f64::consts::E; +use fastrand; +use std::{io, vec}; + +/// Creates a representative sample of `input` of `size` bytes. +pub fn create_sample(input: &mut R, size: usize) -> Vec { + let reservoir = Reservoir::new(size); + reservoir.fill(input) +} + +/// A reservoir is created from an input stream. +/// +/// Once filled, it will contain a best effort sample of a dataset, +/// where each input value has an equivalent probability of being included. +struct Reservoir { + /// Where the sampled data is stored. + /// + /// Once the lake is filled, then this should contain a representative sample + /// of the larger dataset. + lake: Vec, + /// K is the size of each sample. + /// + /// The original Zstd dictionary implementation states that values + /// between 16 and 2048+ are reasonable. + k: u16, +} + +impl Reservoir { + /// Initialize a new empty reservoir, creating an allocation of `size`. + pub fn new(size: usize) -> Self { + assert!(size >= 16, "Reservoirs cannot be below 16 bytes in size"); + let lake: Vec = vec![0; size]; + let k = K as u16; + Self { lake, k } + } + + /// Filling the reservoir is performed using Algorithm L. + /// + /// The return value is the populated reservoir. + pub fn fill(mut self, source: &mut R) -> Vec { + // https://en.wikipedia.org/wiki/Reservoir_sampling#:~:text=end%0A%20%20end%0Aend-,Optimal%3A%20Algorithm,-L%5Bedit + // https://richardstartin.github.io/posts/reservoir-sampling#algorithm-l:~:text=%3B%0A%20%20%20%20%7D%0A%7D-,Algorithm%20L,-Algorithm%20L%20was + + // First fill the reservoir with the start of the input stream + let mut total_bytes_read: usize = 0; + while let Ok(num_bytes) = source.read(self.lake.as_mut_slice()) { + total_bytes_read += num_bytes; + // Stop when we've completely filled the buffer + if total_bytes_read == self.lake.len() { + break; + } + // If we haven't filled the lake all the way, resize it + if num_bytes == 0 { + self.lake.resize(total_bytes_read, 0); + } + } + + let mut threshold = E.powf(fastrand::f64().ln() / f64::from(self.k)); + // An index into the stream of the next sample to take + let mut next = self.lake.len(); + // Because we're sampling k-mers of size K into the lake, + // split the lake into chunks of k size for simplicity + let mut lake_chunks = self + .lake + .chunks_mut(self.k as usize) + .collect::>(); + // Used when discarding chunks + let end_of_lake = lake_chunks.len(); + let mut counter = end_of_lake / self.k as usize; + // Algorithm L is considered better than algorithm R because it + // determines how many inputs can be skipped, rather than + // processing every input. + + // This is done by abusing the statistics in ways + // I do not understand. + + // Items with a weight smaller than the threshold enter the lake, + // replacing the item in the lake with the largest threshold + + let mut dumpster = Vec::with_capacity(self.k as usize); + loop { + // `num_bytes_read` is kept track of to watch for EOD. + let num_bytes_read: u64; + if counter == next { + num_bytes_read = source + .read(lake_chunks[fastrand::usize(0..end_of_lake)]) + .unwrap() as u64; + // Advance at least to the next sample, skipping forward a few samples + next += ((fastrand::f64().ln() / f64::ln(1.0 - threshold)).floor() as usize + 1) + * self.k as usize; + // Update the threshold to reflect changes + threshold *= E.powf(fastrand::f64().ln() / f64::from(end_of_lake as u32)) + } else { + // Drop the next chunk + num_bytes_read = source.read(&mut dumpster).unwrap() as u64; + //source.seek_relative(self.k.into()).unwrap(); + } + if num_bytes_read == 0 { + break; + } + counter += self.k as usize; + } + self.lake.shrink_to_fit(); + self.lake + } +} + +#[cfg(test)] +mod tests { + use super::Reservoir; + use alloc::vec; + + #[test] + fn initial_fill() { + // Create a reservoir 16 bytes in size and read + // 16 bytes into it + let r = Reservoir::new(16); + let test_data = vec![0_u8; 16]; + let output = r.fill(&mut test_data.as_slice()); + assert_eq!(test_data, output); + } + + #[test] + fn shrinks_for_small_sample() { + // Create a reservoir larger than the sample. + // The output should be smaller. + let r = Reservoir::new(32); + let test_data = vec![0_u8; 28]; + let output = r.fill(&mut test_data.as_slice()); + assert!(output.len() == 28); + } + + #[test] + fn lake_doesnt_grow() { + // Create a sample larger than the reservoir + // The output should be smaller. + let r = Reservoir::new(32); + let test_data = vec![0_u8; 16_000_000]; + let output = r.fill(&mut test_data.as_slice()); + assert!(output.len() == 32); + } +} diff --git a/src/encoding/blocks/compressed.rs b/src/encoding/blocks/compressed.rs index 85593e181..ad7f413cf 100644 --- a/src/encoding/blocks/compressed.rs +++ b/src/encoding/blocks/compressed.rs @@ -8,6 +8,7 @@ use crate::{ huff0::huff0_encoder, }; +/// A block of [`crate::common::BlockType::Compressed`] pub fn compress_block(state: &mut CompressState, output: &mut Vec) { let mut literals_vec = Vec::new(); let mut sequences = Vec::new(); diff --git a/src/encoding/mod.rs b/src/encoding/mod.rs index 33c98a591..2d9797e9b 100644 --- a/src/encoding/mod.rs +++ b/src/encoding/mod.rs @@ -105,7 +105,7 @@ pub enum Sequence<'data> { /// Is encoded as a sequence for the decoder sequence execution. /// /// First the literals will be copied to the decoded data, - /// then `match_len` bytes are copied from `offset` bytes back in the buffer + /// then `match_len` bytes are copied from `offset` bytes back in the decoded data Triple { literals: &'data [u8], offset: usize, diff --git a/src/fse/fse_decoder.rs b/src/fse/fse_decoder.rs index bf573c1b0..7cd59dc6d 100644 --- a/src/fse/fse_decoder.rs +++ b/src/fse/fse_decoder.rs @@ -297,7 +297,7 @@ impl FSETable { }); } - let bytes_read = if br.bits_read() % 8 == 0 { + let bytes_read = if br.bits_read().is_multiple_of(8) { br.bits_read() / 8 } else { (br.bits_read() / 8) + 1 diff --git a/src/huff0/huff0_decoder.rs b/src/huff0/huff0_decoder.rs index 5c3e98bf0..1952aea3c 100644 --- a/src/huff0/huff0_decoder.rs +++ b/src/huff0/huff0_decoder.rs @@ -245,7 +245,7 @@ impl HuffmanTable { let num_weights = header - 127; self.weights.resize(num_weights as usize, 0); - let bytes_needed = if num_weights % 2 == 0 { + let bytes_needed = if num_weights.is_multiple_of(2) { num_weights as usize / 2 } else { (num_weights as usize / 2) + 1 diff --git a/src/lib.rs b/src/lib.rs index 0d87f5ee3..0f85407a4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,8 +7,8 @@ //! //! ## Compression //! The [encoding] module contains the code for compression. -//! Decompression can be achieved by using the [`encoding::compress`]/[`encoding::compress_to_vec`] -//! functions or the [`encoding::FrameCompressor`] +//! Compression can be achieved by using the [`encoding::compress`]/[`encoding::compress_to_vec`] +//! functions or [`encoding::FrameCompressor`] //! #![doc = include_str!("../Readme.md")] #![no_std] @@ -35,6 +35,9 @@ macro_rules! vprintln { mod bit_io; mod common; pub mod decoding; +#[cfg(feature = "dict_builder")] +#[cfg_attr(docsrs, doc(cfg(feature = "dict_builder")))] +pub mod dictionary; pub mod encoding; pub(crate) mod blocks; @@ -49,8 +52,6 @@ pub(crate) mod fse; #[cfg(not(feature = "fuzz_exports"))] pub(crate) mod huff0; -mod tests; - #[cfg(feature = "std")] pub mod io_std; @@ -62,3 +63,5 @@ pub mod io_nostd; #[cfg(not(feature = "std"))] pub use io_nostd as io; + +mod tests; diff --git a/src/tests/decode_corpus.rs b/src/tests/decode_corpus.rs index 049ba1d8b..400b1b4fe 100644 --- a/src/tests/decode_corpus.rs +++ b/src/tests/decode_corpus.rs @@ -7,6 +7,7 @@ fn test_decode_corpus_files() { use alloc::string::{String, ToString}; use alloc::vec::Vec; use std::fs; + use std::io::BufReader; use std::io::Read; use std::println; @@ -82,7 +83,7 @@ fn test_decode_corpus_files() { let mut original_p = p.clone(); original_p.truncate(original_p.len() - 4); - let original_f = fs::File::open(original_p).unwrap(); + let original_f = BufReader::new(fs::File::open(original_p).unwrap()); let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); println!("Results for file: {}", p.clone()); diff --git a/src/tests/dict_test.rs b/src/tests/dict_test.rs index 37eca271a..1fd17d011 100644 --- a/src/tests/dict_test.rs +++ b/src/tests/dict_test.rs @@ -83,6 +83,7 @@ fn test_dict_decoding() { use alloc::string::{String, ToString}; use alloc::vec::Vec; use std::fs; + use std::io::BufReader; use std::io::Read; use std::println; @@ -97,7 +98,7 @@ fn test_dict_decoding() { let mut speeds_read = Vec::new(); let mut files: Vec<_> = fs::read_dir("./dict_tests/files").unwrap().collect(); - let dict = fs::File::open("./dict_tests/dictionary").unwrap(); + let dict = BufReader::new(fs::File::open("./dict_tests/dictionary").unwrap()); let dict: Vec = dict.bytes().map(|x| x.unwrap()).collect(); files.sort_by_key(|x| match x { @@ -155,7 +156,7 @@ fn test_dict_decoding() { let mut original_p = p.clone(); original_p.truncate(original_p.len() - 4); - let original_f = fs::File::open(original_p).unwrap(); + let original_f = BufReader::new(fs::File::open(original_p).unwrap()); let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); println!("Results for file: {}", p.clone()); diff --git a/src/tests/mod.rs b/src/tests/mod.rs index c0730f535..13090296e 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -130,8 +130,9 @@ fn test_frame_decoder() { fn test_decode_from_to() { use crate::decoding::FrameDecoder; use std::fs::File; + use std::io::BufReader; use std::io::Read; - let f = File::open("./decodecorpus_files/z000088.zst").unwrap(); + let f = BufReader::new(File::open("./decodecorpus_files/z000088.zst").unwrap()); let mut frame_dec = FrameDecoder::new(); let content: Vec = f.bytes().map(|x| x.unwrap()).collect(); @@ -197,7 +198,7 @@ fn test_decode_from_to() { None => std::println!("No checksums to test\n"), } - let original_f = File::open("./decodecorpus_files/z000088").unwrap(); + let original_f = BufReader::new(File::open("./decodecorpus_files/z000088").unwrap()); let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); if original.len() != result.len() { @@ -233,6 +234,7 @@ fn test_specific_file() { use crate::decoding::BlockDecodingStrategy; use crate::decoding::FrameDecoder; use std::fs; + use std::io::BufReader; use std::io::Read; let path = "./decodecorpus_files/z000068.zst"; @@ -256,7 +258,7 @@ fn test_specific_file() { .unwrap(); let result = frame_dec.collect().unwrap(); - let original_f = fs::File::open("./decodecorpus_files/z000088").unwrap(); + let original_f = BufReader::new(fs::File::open("./decodecorpus_files/z000088").unwrap()); let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); std::println!("Results for file: {path}"); @@ -293,6 +295,7 @@ fn test_specific_file() { #[cfg(feature = "std")] fn test_streaming() { use std::fs; + use std::io::BufReader; use std::io::Read; let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap(); @@ -301,7 +304,7 @@ fn test_streaming() { let mut result = Vec::new(); Read::read_to_end(&mut stream, &mut result).unwrap(); - let original_f = fs::File::open("./decodecorpus_files/z000088").unwrap(); + let original_f = BufReader::new(fs::File::open("./decodecorpus_files/z000088").unwrap()); let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); if original.len() != result.len() { @@ -343,7 +346,7 @@ fn test_streaming() { let mut result = Vec::new(); Read::read_to_end(&mut stream, &mut result).unwrap(); - let original_f = fs::File::open("./decodecorpus_files/z000068").unwrap(); + let original_f = BufReader::new(fs::File::open("./decodecorpus_files/z000068").unwrap()); let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); std::println!("Results for file:"); @@ -576,3 +579,10 @@ pub mod dict_test; #[cfg(feature = "std")] pub mod encode_corpus; pub mod fuzz_regressions; + +#[cfg(feature = "std")] +#[test] +fn verbose_disabled() { + use crate::VERBOSE; + assert_eq!(VERBOSE, false); +}