Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
**/*.rs.bk
Cargo.lock
/local_corpus_files
/local_dict_corpus_files
/orig-zstd
fuzz_decodecorpus
perf.data*
Expand Down
13 changes: 12 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ readme = "Readme.md"
keywords = ["zstd", "zstandard", "decompression"]
categories = ["compression"]

[package.metadata.docs.rs]
all-features = true
rustdoc-args = ["--cfg", "docsrs"]

[dependencies]
twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"], optional = true }

Expand All @@ -20,17 +24,20 @@ twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"]
compiler_builtins = { version = "0.1.2", optional = true }
core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" }
fastrand = "2.3.0"


[dev-dependencies]
criterion = "0.5"
rand = { version = "0.8.5", features = ["small_rng"] }
zstd = "0.13.2"
zstd = { version = "0.13.2", features = ["zstdmt"]}

[features]
default = ["hash", "std"]
hash = ["dep:twox-hash"]
fuzz_exports = []
std = []
dict_builder = ["std"]

# Internal feature, only used when building as part of libstd, not part of the
# stable interface of this crate.
Expand All @@ -47,3 +54,7 @@ required-features = ["std"]
[[bin]]
name = "zstd_stream"
required-features = ["std"]

[[bin]]
name = "zstd_dict"
required-features = ["std", "dict_builder"]
37 changes: 32 additions & 5 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,49 @@ This crate is currently actively maintained.

# Current Status

Feature complete on the decoder side.
## Decompression
The `decoding` module provides a complete
implementation of a Zstandard decompressor.

In terms of speed, `ruzstd` is behind the original C implementation
which has a rust binding located [here](https://github.com/gyscos/zstd-rs).

Measured with the `time` utility, with the original zstd and my decoder
both decoding the same enwik9.zst file from a ramfs, my decoder is about
3.5 times slower. Enwik9 is highly compressible; for less compressible
data (like an Ubuntu installation .iso) my decoder comes close to being
only 1.4 times slower.

## Compression
On the compression side:
- Support for generating compressed blocks at any compression level
- [x] Uncompressed
- [x] Fastest (roughly level 1)
- [ ] Default (roughly level 3)
- [ ] Better (roughly level 7)
- [ ] Best (roughly level 11)
- [ ] Checksums
- [x] Checksums
- [ ] Dictionaries

## Speed
In terms of speed this library is behind the original C implementation which has a rust binding located [here](https://github.com/gyscos/zstd-rs).
## Dictionary Generation
When the `dict_builder` feature is enabled, the `dictionary` module
provides the ability to create new dictionaries.

On the `github-users` sample set, our implementation benchmarks within
0.2% of the official implementation (as of commit
`09e52d07340acdb2e13817b066e8be6e424f7258`):
```no_build
uncompressed: 100.00% (7484607 bytes)
no dict: 34.99% of original size (2618872 bytes)
reference dict: 16.16% of no dict size (2195672 bytes smaller)
our dict: 16.28% of no dict size (2192400 bytes smaller)
```

The dictionary generator only provides support for creating "raw
content" dictionaries. Tagged dictionaries are currently unsupported.

Measuring with the 'time' utility the original zstd and my decoder both decoding the same enwik9.zst file from a ramfs, my decoder is about 3.5 times slower. Enwik9 is highly compressible, for less compressible data (like a ubuntu installation .iso) my decoder comes close to only being 1.4 times slower.
See <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format>
for clarification.


# How can you use it?
Expand Down
3 changes: 2 additions & 1 deletion src/bin/zstd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ struct StateTracker {
file_size: u64,
old_percentage: i8,
}

#[allow(unused)]
fn decompress(flags: &[String], file_paths: &[String]) {
if !flags.contains(&"-d".to_owned()) {
eprintln!("This zstd implementation only supports decompression. Please add a \"-d\" flag");
Expand Down Expand Up @@ -128,6 +128,7 @@ fn decompress(flags: &[String], file_paths: &[String]) {
}
}

#[allow(unused)]
struct PercentPrintReader<R: Read> {
total: usize,
counter: usize,
Expand Down
24 changes: 24 additions & 0 deletions src/bin/zstd_dict.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
use ruzstd::dictionary::{create_raw_dict_from_dir, create_raw_dict_from_source};
use std::env::args;
use std::fs::File;
use std::path::Path;

/// Command-line entry point for building a raw dictionary.
///
/// Expects three positional arguments:
/// 1. the input path (a single file, or anything else, which is handed to
///    `create_raw_dict_from_dir`),
/// 2. the output path the dictionary is written to,
/// 3. the requested dictionary size.
///
/// Any missing or malformed argument, or a failure to open/create a file,
/// aborts the program with a panic.
fn main() {
    let argv: Vec<String> = args().collect();
    let input_path = Path::new(argv.get(1).expect("no input provided"));
    let output_path = Path::new(argv.get(2).expect("no output path provided"));
    // NOTE(review): the usage message mentions "(kb)", but the parsed value is
    // forwarded unscaled -- confirm which unit the dictionary builder expects.
    let dict_size: usize = argv
        .get(3)
        .expect("no dict size provided (kb)")
        .parse()
        .expect("dict size was not a valid num");

    let mut output = File::create(output_path).unwrap();

    // Anything that is not a regular file goes down the directory path.
    if !input_path.is_file() {
        create_raw_dict_from_dir(input_path, &mut output, dict_size).unwrap();
        return;
    }

    // Single-file input: the builder is given the reader plus its length,
    // taken from the file's metadata.
    let source = File::open(input_path).expect("unable to open input path");
    let source_size = source.metadata().unwrap().len();
    create_raw_dict_from_source(source, source_size as usize, &mut output, dict_size);
}
6 changes: 3 additions & 3 deletions src/bit_io/bit_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ impl<'s> BitReader<'s> {

let mut bit_shift = bits_left_in_current_byte; //this many bits are already set in value

assert!(self.idx % 8 == 0);
assert!(self.idx.is_multiple_of(8));

//collect full bytes
for _ in 0..full_bytes_needed {
Expand Down Expand Up @@ -116,7 +116,7 @@ impl core::fmt::Display for GetBitsError {
} => {
write!(
f,
"Cant serve this request. The reader is limited to {limit} bits, requested {num_requested_bits} bits",
"Cant serve this request. The reader is limited to {limit} bits, requested {num_requested_bits} bits"
)
}
GetBitsError::NotEnoughRemainingBits {
Expand All @@ -125,7 +125,7 @@ impl core::fmt::Display for GetBitsError {
} => {
write!(
f,
"Can\'t read {requested} bits, only have {remaining} bits left",
"Can\'t read {requested} bits, only have {remaining} bits left"
)
}
}
Expand Down
10 changes: 5 additions & 5 deletions src/bit_io/bit_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {

/// Reset to an index. Currently only supports resetting to a byte aligned index
pub fn reset_to(&mut self, index: usize) {
assert!(index % 8 == 0);
assert!(index.is_multiple_of(8));
self.partial = 0;
self.bits_in_partial = 0;
self.bit_idx = index;
Expand All @@ -66,7 +66,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {

// We might be changing bits unaligned to byte borders.
// This means the lower bits of the first byte we are touching must stay the same
if idx % 8 != 0 {
if !idx.is_multiple_of(8) {
// How many (upper) bits will change in the first byte?
let bits_in_first_byte = 8 - (idx % 8);
// We don't support only changing a few bits in the middle of a byte
Expand All @@ -82,7 +82,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
idx += bits_in_first_byte;
}

assert!(idx % 8 == 0);
assert!(idx.is_multiple_of(8));
// We are now byte aligned, change idx to byte resolution
let mut idx = idx / 8;

Expand Down Expand Up @@ -113,7 +113,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {

/// Flush temporary internal buffers to the output buffer. Only works if this is currently byte aligned
pub fn flush(&mut self) {
assert!(self.bits_in_partial % 8 == 0);
assert!(self.bits_in_partial.is_multiple_of(8));
let full_bytes = self.bits_in_partial / 8;
self.output
.as_mut()
Expand Down Expand Up @@ -204,7 +204,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
/// Returns how many bits are missing for an even byte
pub fn misaligned(&self) -> usize {
let idx = self.index();
if idx % 8 == 0 {
if idx.is_multiple_of(8) {
0
} else {
8 - (idx % 8)
Expand Down
134 changes: 134 additions & 0 deletions src/dictionary/cover.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
//! An implementation of the local maximum coverage algorithm
//! described in the paper "Effective Construction of Relative Lempel-Ziv Dictionaries",
//! by Liao, Petri, Moffat, and Wirth, published under the University of Melbourne.
//!
//! See: <https://people.eng.unimelb.edu.au/ammoffat/abstracts/lpmw16www.pdf>
//!
//! Facebook's implementation was also used as a reference.
//! <https://github.com/facebook/zstd/tree/dev/lib/dictBuilder>

use super::DictParams;
use crate::dictionary::frequency::estimate_frequency;
use core::convert::TryInto;
use std::collections::HashMap;
use std::vec::Vec;

/// The size of each k-mer, in bytes.
///
/// As found under "4: Experiments - Varying k-mer Size" in the original paper,
/// "when k = 16, across all our text collections, there is a reasonable spread"
///
/// Reasonable range: [6, 16]
pub(super) const K: usize = 16;

/// A single k-mer: a fixed-size array of `K` bytes.
pub(super) type KMer = [u8; K];

/// A candidate chunk of sample data together with its usefulness score.
pub struct Segment {
    /// The bytes that make up the segment.
    pub raw: Vec<u8>,
    /// How desirable it is to include this segment in the dictionary.
    ///
    /// Larger is better and the value is unbounded. It is derived from an
    /// estimate of the number of occurrences within a given epoch.
    pub score: usize,
}

// Ordering and equality deliberately look at `score` only (never at `raw`),
// because segments are compared solely to establish heap order.

impl Ord for Segment {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        Ord::cmp(&self.score, &other.score)
    }
}

impl PartialOrd for Segment {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        // Total order: always defer to `Ord`.
        Some(Ord::cmp(self, other))
    }
}

impl PartialEq for Segment {
    fn eq(&self, other: &Self) -> bool {
        // Two segments with the same score are considered equal, whatever
        // their contents.
        self.score.eq(&other.score)
    }
}

impl Eq for Segment {}

/// A re-usable allocation containing large allocations
/// that are used multiple times during dictionary construction (once per epoch)
pub struct Context {
    /// Keeps track of the number of occurrences of a particular k-mer within an epoch.
    ///
    /// `score_segment` records the estimated frequency of every k-mer it scores
    /// here, and skips k-mers that are already present.
    ///
    /// Reset for each epoch.
    pub frequencies: HashMap<KMer, usize>,
}

/// Scans an epoch segment by segment and returns the highest scoring one.
///
/// `collection_sample` (the epoch) is cut into consecutive chunks of
/// `params.segment_size` bytes; the last chunk may be shorter. Every chunk is
/// scored with `score_segment`, which also updates `ctx.frequencies`.
///
/// On ties the earliest segment wins. If no segment scores above zero, the
/// first segment is returned with a score of `0`. The returned [`Segment`]
/// owns a copy of the winning bytes.
///
/// # Panics
///
/// Panics if `collection_sample` is empty or if `params.segment_size` is zero.
pub fn pick_best_segment(
    params: &DictParams,
    ctx: &mut Context,
    collection_sample: &'_ [u8],
) -> Segment {
    let chunk_len = params.segment_size as usize;
    let candidates = collection_sample.chunks(chunk_len);

    // Start out with the first chunk as the provisional winner.
    let mut winner: &[u8] = candidates
        .clone()
        .next()
        .expect("at least one segment");
    let mut winning_score: usize = 0;

    for candidate in candidates {
        let candidate_score = score_segment(ctx, collection_sample, candidate);
        // Only a strictly better score displaces the current winner.
        if candidate_score <= winning_score {
            continue;
        }
        winner = candidate;
        winning_score = candidate_score;
    }

    Segment {
        raw: winner.to_vec(),
        score: winning_score,
    }
}

/// Given a segment, compute the score (or usefulness) of that segment against the entire epoch.
///
/// The score is the sum of the estimated frequencies (as reported by
/// `estimate_frequency` over `collection_sample`) of the segment's k-mers
/// that have not been seen before. Every newly scored k-mer is recorded in
/// `ctx.frequencies`, so a k-mer that is already in the pool contributes
/// nothing.
///
/// Segments too short to contain a scored k-mer yield a score of `0`.
///
/// `score_segment` modifies `ctx.frequencies`.
fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> usize {
    // Number of k-mer start positions to score. A plain `len - K - 1` would
    // underflow for segments of `K` bytes or fewer (e.g. the short trailing
    // chunk of an epoch), so saturate at zero instead.
    //
    // NOTE(review): a segment of n bytes contains n - K + 1 k-mers, so this
    // bound stops two k-mers short of the end. It is kept unchanged so that
    // existing scores stay the same -- confirm whether that is intentional.
    let scored_kmers = segment.len().saturating_sub(K + 1);

    let mut segment_score = 0;
    // Determine the score of each overlapping k-mer
    for window in segment.windows(K).take(scored_kmers) {
        let kmer: &KMer = window.try_into().expect("Failed to make kmer");
        // If the k-mer is already in the pool, it receives a score of zero.
        // The entry API checks and inserts with a single hash lookup.
        if let std::collections::hash_map::Entry::Vacant(slot) = ctx.frequencies.entry(*kmer) {
            let kmer_score = estimate_frequency(kmer, collection_sample);
            slot.insert(kmer_score);
            segment_score += kmer_score;
        }
    }

    segment_score
}

/// Computes the number of epochs and the size of each epoch.
///
/// Returns a (number of epochs, epoch size) tuple.
///
/// The initial guess is one epoch per segment that fits into the dictionary
/// (`max_dict_size / params.segment_size`, at least one). If that makes the
/// epochs smaller than the minimum epoch size, the epoch size is raised to
/// the minimum (capped at `num_kmers`) and the epoch count is recomputed.
///
/// When `num_kmers` is zero there is nothing to partition and `(0, 0)` is
/// returned.
///
/// A translation of `COVER_epoch_info_t COVER_computeEpochs()` from facebook/zstd.
///
/// # Panics
///
/// Panics if `params.segment_size` is zero while `num_kmers` is non-zero.
pub fn compute_epoch_info(
    params: &DictParams,
    max_dict_size: usize,
    num_kmers: usize,
) -> (usize, usize) {
    // Without any k-mers the fallback below would compute
    // `num_kmers / 0`; report "no epochs" instead of panicking.
    if num_kmers == 0 {
        return (0, 0);
    }

    let min_epoch_size = 10_000; // 10 kB (decimal)
    let num_epochs = usize::max(1, max_dict_size / params.segment_size as usize);
    let epoch_size = num_kmers / num_epochs;
    if epoch_size >= min_epoch_size {
        assert!(epoch_size * num_epochs <= num_kmers);
        return (num_epochs, epoch_size);
    }

    // Epochs would be too small: grow them to the minimum size (or to the
    // whole input if it is smaller than that). `num_kmers` is non-zero here,
    // so `epoch_size` is as well.
    let epoch_size = usize::min(min_epoch_size, num_kmers);
    (num_kmers / epoch_size, epoch_size)
}
Loading