Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ name = "round_trip"
bench = false
test = false

[[example]]
name = "fsst12_round_trip"
bench = false
test = false

[[bench]]
name = "compress"
harness = false
Expand Down
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,24 @@ but it is mostly written from a careful reading of the paper.

**NOTE: This crate only works on little-endian architectures currently. There are no current plans to support big-endian targets.**

## FSST12 variant

The `fsst::fsst12` module implements the 12-bit-code FSST variant from the
[cwida/fsst][MIT-licensed implementation] reference implementation (also mentioned in the
[FastLanes File Format paper][fastlanes]). Codes are 12 bits wide (4096 entries), the first 256
codes are reserved as single-byte identity codes, and there is no escape mechanism. A byte that
matches no learned symbol is emitted as its 12-bit identity code, so it costs 1.5× its plaintext
size: still an expansion, but a lighter penalty than classic FSST's 2× escape cost.

```rust
use fsst::fsst12::Compressor12;

let compressor = Compressor12::train(&[b"the quick brown fox".as_slice()]);
let compressed = compressor.compress(b"the quick brown fox");
let decompressed = compressor.decompressor().decompress(&compressed);
assert_eq!(decompressed, b"the quick brown fox");
```

[whitepaper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
[MIT-licensed implementation]: https://github.com/cwida/fsst
[fastlanes]: https://www.vldb.org/pvldb/vol18/p4629-afroozeh.pdf
39 changes: 39 additions & 0 deletions benches/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use criterion::{Criterion, Throughput, criterion_group, criterion_main};

use curl::easy::Easy;
use fsst::Compressor;
use fsst::fsst12::Compressor12;

fn download_dataset(url: &str, path: impl AsRef<Path>) -> Result<(), Box<dyn Error>> {
let target = path.as_ref();
Expand Down Expand Up @@ -95,6 +96,42 @@ fn run_bench(name: &str, buf: &[u8], c: &mut Criterion) {
)
}

/// Benchmarks the FSST12 variant on `buf`, registering results under `fsst12/{name}`.
///
/// Registers three measurements ("train-and-compress", "compress-only" and "decompress")
/// and, once the group is finished, prints the compression factor achieved on `buf`.
fn run_bench_fsst12(name: &str, buf: &[u8], c: &mut Criterion) {
    let mut group = c.benchmark_group(format!("fsst12/{name}"));

    group.bench_function("train-and-compress", |b| {
        b.iter_with_large_drop(|| {
            let compressor = Compressor12::train(&[buf]);
            compressor.compress(std::hint::black_box(buf))
        });
    });

    let compressor = Compressor12::train(&[buf]);
    // Record the compressed size now, using the compressor that is actually benchmarked,
    // so the report at the end does not need to train yet another compressor on `buf`.
    let compressed_size = compressor.compress(buf).len();

    // 12 bits of output per input byte is 1.5x the input; the `+ 2` adds a little slack.
    let mut buffer = Vec::with_capacity(buf.len() * 3 / 2 + 2);
    group.throughput(Throughput::Bytes(buf.len() as u64));
    group.bench_function("compress-only", |b| {
        // SAFETY: `buffer` capacity holds the worst-case FSST12 output.
        b.iter(|| unsafe { compressor.compress_into(buf, &mut buffer) });
    });

    // Leave one compressed copy of `buf` in `buffer` as the decompression input.
    // SAFETY: same as above.
    unsafe { compressor.compress_into(buf, &mut buffer) };
    let decompressor = compressor.decompressor();
    group.bench_function("decompress", |b| {
        b.iter_with_large_drop(|| decompressor.decompress(&buffer));
    });

    group.finish();

    // Report the compression factor for this dataset.
    let uncompressed_size = buf.len();
    let cf = (uncompressed_size as f64) / (compressed_size as f64);
    println!(
        "fsst12 compressed {name} {uncompressed_size} => {compressed_size}B (compression factor {cf:.2}:1)"
    );
}

#[allow(clippy::use_debug)]
fn bench_dbtext(c: &mut Criterion) {
fn run_dataset_bench(name: &str, url: &str, path: &str, c: &mut Criterion) {
Expand All @@ -104,6 +141,7 @@ fn bench_dbtext(c: &mut Criterion) {
File::open(path).unwrap().read_to_end(&mut buf).unwrap();

run_bench(name, &buf, c);
run_bench_fsst12(name, &buf, c);
}

run_dataset_bench(
Expand Down Expand Up @@ -136,6 +174,7 @@ fn bench_small_input(c: &mut Criterion) {
}

run_bench("small-input", &buf, c);
run_bench_fsst12("small-input", &buf, c);
}

criterion_group!(compress_bench, bench_dbtext, bench_small_input);
Expand Down
69 changes: 68 additions & 1 deletion benches/micro.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

use criterion::{Criterion, Throughput, criterion_group, criterion_main};

use fsst::fsst12::{Compressor12, CompressorBuilder12};
use fsst::{CompressorBuilder, Symbol};

fn one_megabyte(seed: &[u8]) -> Vec<u8> {
Expand Down Expand Up @@ -225,10 +226,76 @@ fn bench_compress(c: &mut Criterion) {
let _ = std::hint::black_box(output_buf);
}

/// Micro-benchmarks for the FSST12 variant over the buffer returned by
/// `one_megabyte(b"abcdefgh")`.
///
/// Registers three groups:
/// - `fsst12/cf=8`: compress/decompress with a hand-built table holding the single symbol
///   `abcdefgh`.
/// - `fsst12/identity`: compress/decompress with a builder that had no symbols inserted.
/// - `fsst12/train-and-compress`: training plus compression on the same buffer.
///
/// `output_buf` and `decoded_buf` are allocated once and reused by every iteration.
fn bench_fsst12_micro(c: &mut Criterion) {
let test_string = one_megabyte(b"abcdefgh");
// 12 bits of output per input byte is 1.5x the input; the `+ 2` adds a little slack.
let mut output_buf: Vec<u8> = Vec::with_capacity(test_string.len() * 3 / 2 + 2);
// NOTE(review): the extra 8 bytes are presumably slack for whole-symbol writes — confirm
// against `decompress_into`.
let mut decoded_buf: Vec<u8> = Vec::with_capacity(test_string.len() + 8);

// Best case: one learned 8-byte symbol covers every input position.
let mut group = c.benchmark_group("fsst12/cf=8");
group.throughput(Throughput::Bytes(test_string.len() as u64));
let mut builder = CompressorBuilder12::new();
assert!(builder.insert(Symbol::from_slice(b"abcdefgh"), 8));
let compressor = builder.build();
group.bench_function("compress", |b| {
// SAFETY: output_buf capacity holds the worst-case FSST12 output.
b.iter(|| unsafe { compressor.compress_into(&test_string, &mut output_buf) });
});
// Leave one compressed copy in `output_buf` as input for the decompress benchmark.
// SAFETY: same capacity invariant as above.
unsafe { compressor.compress_into(&test_string, &mut output_buf) };
let decompressor = compressor.decompressor();
group.bench_function("decompress", |b| {
b.iter(|| {
let len = decompressor.decompress_into(&output_buf, decoded_buf.spare_capacity_mut());
// SAFETY: decompress_into initialized exactly `len` bytes.
unsafe { decoded_buf.set_len(len) };
let _ = std::hint::black_box(&decoded_buf);
// Reset the length so the next iteration gets the full spare capacity again.
decoded_buf.clear();
});
});
group.finish();

// Worst case: no learned symbols, every byte through an identity code.
let mut group = c.benchmark_group("fsst12/identity");
group.throughput(Throughput::Bytes(test_string.len() as u64));
let identity_compressor = CompressorBuilder12::new().build();
group.bench_function("compress", |b| {
// SAFETY: output_buf capacity holds the worst-case FSST12 output.
b.iter(|| unsafe { identity_compressor.compress_into(&test_string, &mut output_buf) });
});
// Overwrite `output_buf` with the identity-coded stream for the decompress benchmark.
// SAFETY: same.
unsafe { identity_compressor.compress_into(&test_string, &mut output_buf) };
let identity_decompressor = identity_compressor.decompressor();
group.bench_function("decompress", |b| {
b.iter(|| {
let len = identity_decompressor
.decompress_into(&output_buf, decoded_buf.spare_capacity_mut());
// SAFETY: decompress_into initialized exactly `len` bytes (same contract as the
// cf=8 decompress loop above).
unsafe { decoded_buf.set_len(len) };
let _ = std::hint::black_box(&decoded_buf);
// Reset the length so the next iteration gets the full spare capacity again.
decoded_buf.clear();
});
});
group.finish();

// End-to-end cost: train a fresh compressor and compress the buffer in every iteration.
let mut group = c.benchmark_group("fsst12/train-and-compress");
group.throughput(Throughput::Bytes(test_string.len() as u64));
group.bench_function("1mb-abcdefgh", |b| {
b.iter_with_large_drop(|| {
let compressor = Compressor12::train(&[test_string.as_slice()]);
compressor.compress(std::hint::black_box(&test_string))
});
});
group.finish();

// Keep both scratch buffers observable so the writes into them are not optimized away.
let _ = std::hint::black_box(&output_buf);
let _ = std::hint::black_box(&decoded_buf);
}

criterion_group!(
bench_micro,
bench_compress,
bench_decompress_short,
bench_decompress_escape_heavy
bench_decompress_escape_heavy,
bench_fsst12_micro
);
criterion_main!(bench_micro);
25 changes: 25 additions & 0 deletions examples/fsst12_round_trip.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
//! End-to-end example using the FSST12 (12-bit code) variant.

use core::str;

use fsst::fsst12::Compressor12;

/// Trains an FSST12 compressor on a repeated phrase, compresses that same text, prints the
/// size statistics, and checks that decompression reproduces the input.
fn main() {
    let text = "the quick brown fox jumped over the lazy dog. ".repeat(32);
    let plaintext = text.as_bytes();

    let compressor = Compressor12::train(&[plaintext]);
    let encoded = compressor.compress(plaintext);

    // The first 256 codes are the reserved single-byte identity codes; the rest were learned.
    let learned_symbols = compressor.symbol_table().len() - 256;
    let ratio = text.len() as f64 / encoded.len() as f64;
    println!(
        "compressed: {} => {} bytes ({} learned symbols, {:.2}:1 ratio)",
        text.len(),
        encoded.len(),
        learned_symbols,
        ratio,
    );

    let round_tripped = compressor.decompressor().decompress(&encoded);
    let as_text = str::from_utf8(&round_tripped).unwrap();
    assert_eq!(as_text, text);
    println!("decoded: len={} bytes round-tripped", round_tripped.len());
}
28 changes: 28 additions & 0 deletions fuzz/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,31 @@ path = "fuzz_targets/fuzz_compress.rs"
test = false
doc = false
bench = false

[[bin]]
name = "fuzz_fsst12_train"
path = "fuzz_targets/fuzz_fsst12_train.rs"
test = false
doc = false
bench = false

[[bin]]
name = "fuzz_fsst12_compress"
path = "fuzz_targets/fuzz_fsst12_compress.rs"
test = false
doc = false
bench = false

[[bin]]
name = "fuzz_train_then_compress"
path = "fuzz_targets/fuzz_train_then_compress.rs"
test = false
doc = false
bench = false

[[bin]]
name = "fuzz_fsst12_train_then_compress"
path = "fuzz_targets/fuzz_fsst12_train_then_compress.rs"
test = false
doc = false
bench = false
3 changes: 3 additions & 0 deletions fuzz/fuzz_targets/fuzz_compress.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#![no_main]

// Trains and compresses on the same buffer, so every byte of the input is in the training
// corpus. PHT-miss and unseen-byte paths are exercised by `fuzz_train_then_compress`.

use libfuzzer_sys::fuzz_target;

fuzz_target!(|data: &[u8]| {
Expand Down
14 changes: 14 additions & 0 deletions fuzz/fuzz_targets/fuzz_fsst12_compress.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#![no_main]

// Trains and compresses on the same buffer, so every byte of the input is in the training
// corpus. PHT-miss and unseen-byte paths are exercised by
// `fuzz_fsst12_train_then_compress`.

use libfuzzer_sys::fuzz_target;

fuzz_target!(|data: &[u8]| {
    // Round-trip property: train on `data`, then compressing and decompressing `data`
    // must give back exactly the same bytes.
    let trained = fsst::fsst12::Compressor12::train(&[data]);
    let encoded = trained.compress(data);
    let round_tripped = trained.decompressor().decompress(&encoded);
    assert_eq!(&round_tripped, data);
});
7 changes: 7 additions & 0 deletions fuzz/fuzz_targets/fuzz_fsst12_train.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#![no_main]

use libfuzzer_sys::fuzz_target;

fuzz_target!(|data: &[u8]| {
    // Training alone must not crash on arbitrary input; the compressor is discarded.
    drop(fsst::fsst12::Compressor12::train(&[data]));
});
16 changes: 16 additions & 0 deletions fuzz/fuzz_targets/fuzz_fsst12_train_then_compress.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#![no_main]

// Trains on one byte sequence and compresses a separate `payload`. Unlike
// `fuzz_fsst12_compress`, the compress input may contain bytes or sequences the trainer
// never saw, exercising the identity-fallback and PHT-miss code paths.

use libfuzzer_sys::fuzz_target;

fuzz_target!(|input: (Vec<Vec<u8>>, Vec<u8>)| {
    let (corpus, payload) = input;

    // Train on borrowed views of the corpus, then round-trip the independent payload.
    let training_lines: Vec<&[u8]> = corpus.iter().map(Vec::as_slice).collect();
    let trained = fsst::fsst12::Compressor12::train(&training_lines);

    let encoded = trained.compress(&payload);
    let round_tripped = trained.decompressor().decompress(&encoded);
    assert_eq!(round_tripped, payload);
});
16 changes: 16 additions & 0 deletions fuzz/fuzz_targets/fuzz_train_then_compress.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#![no_main]

// Trains on one byte sequence and compresses a separate `payload`. Unlike `fuzz_compress`,
// the compress input may contain bytes or sequences the trainer never saw, exercising the
// escape path and PHT-miss code paths.

use libfuzzer_sys::fuzz_target;

fuzz_target!(|input: (Vec<Vec<u8>>, Vec<u8>)| {
    let (corpus, payload) = input;

    // Train on borrowed views of the corpus, then round-trip the independent payload.
    let training_lines: Vec<&[u8]> = corpus.iter().map(Vec::as_slice).collect();
    let trained = fsst::Compressor::train(&training_lines);

    let encoded = trained.compress(&payload);
    let round_tripped = trained.decompressor().decompress(&encoded);
    assert_eq!(round_tripped, payload);
});
4 changes: 2 additions & 2 deletions src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -515,14 +515,14 @@ const GENERATIONS: [usize; 5] = [8usize, 38, 68, 98, 128];
#[cfg(miri)]
const GENERATIONS: [usize; 3] = [8usize, 38, 128];

const FSST_SAMPLETARGET: usize = 1 << 14;
pub(crate) const FSST_SAMPLETARGET: usize = 1 << 14;
const FSST_SAMPLELINE: usize = 512;

/// Create a sample from a set of strings in the input.
///
/// The sample is picked based on criteria from the C++ implementation, and it
/// is a vector of subranges of the input strings `str_in`.
fn make_sample<'a>(str_in: &[&'a [u8]], tot_size: usize) -> Vec<&'a [u8]> {
pub(crate) fn make_sample<'a>(str_in: &[&'a [u8]], tot_size: usize) -> Vec<&'a [u8]> {
let mut sample: Vec<&[u8]> = Vec::new();

if tot_size < FSST_SAMPLETARGET {
Expand Down
Loading
Loading