From 8bb72c9da9e0086dc4992e70e9647da3752ecfff Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Fri, 4 Apr 2025 16:24:47 -0600
Subject: [PATCH 01/16] refactor: move compress_fastest to a new file

---
 src/encoding/frame_compressor.rs | 56 +++++---------------------
 src/encoding/levels/fastest.rs   | 67 ++++++++++++++++++++++++++++++++
 src/encoding/levels/mod.rs       |  2 +
 src/encoding/mod.rs              |  4 +-
 4 files changed, 82 insertions(+), 47 deletions(-)
 create mode 100644 src/encoding/levels/fastest.rs
 create mode 100644 src/encoding/levels/mod.rs
diff --git a/src/encoding/frame_compressor.rs b/src/encoding/frame_compressor.rs
index fe7265ab4..727f917c3 100644
--- a/src/encoding/frame_compressor.rs
+++ b/src/encoding/frame_compressor.rs
@@ -9,14 +9,11 @@ use twox_hash::XxHash64;
 use core::hash::Hasher;
 
 use super::{
-    block_header::BlockHeader, blocks::compress_block, frame_header::FrameHeader,
+    block_header::BlockHeader, frame_header::FrameHeader, levels::*,
     match_generator::MatchGeneratorDriver, CompressionLevel, Matcher,
 };
 
-use crate::{
-    common::MAX_BLOCK_SIZE,
-    io::{Read, Write},
-};
+use crate::io::{Read, Write};
 
 /// An interface for compressing arbitrary data with the ZStandard compression algorithm.
 ///
@@ -106,13 +103,14 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
     /// To avoid endlessly encoding from a potentially endless source (like a network socket) you can use the
     /// [Read::take] function
     pub fn compress(&mut self) {
+        // Clearing buffers to allow re-using of the compressor
         self.state.matcher.reset(self.compression_level);
         self.state.last_huff_table = None;
         let source = self.uncompressed_data.as_mut().unwrap();
         let drain = self.compressed_data.as_mut().unwrap();
-
-        let mut output = Vec::with_capacity(1024 * 130);
-        let output = &mut output;
+        // As the frame is compressed, it's stored here
+        let output: &mut Vec<u8> = &mut Vec::with_capacity(1024 * 130);
+        // First write the frame header
         let header = FrameHeader {
             frame_content_size: None,
             single_segment: false,
@@ -120,10 +118,10 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
             dictionary_id: None,
             window_size: Some(self.state.matcher.window_size()),
         };
-
         header.serialize(output);
-
+        // Now compress block by block
         loop {
+            // Read a single block's worth of uncompressed data from the input
             let mut uncompressed_data = self.state.matcher.get_next_space();
             let mut read_bytes = 0;
             let last_block;
@@ -140,6 +138,7 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
                 }
             }
             uncompressed_data.resize(read_bytes, 0);
+            // As we read, hash that data too
             #[cfg(feature = "hash")]
             self.hasher.write(&uncompressed_data);
             // Special handling is needed for compression of a totally empty file (why you'd want to do that, I don't know)
@@ -168,42 +167,7 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
                     output.extend_from_slice(&uncompressed_data);
                 }
                 CompressionLevel::Fastest => {
-                    if uncompressed_data.iter().all(|x| uncompressed_data[0].eq(x)) {
-                        let rle_byte = uncompressed_data[0];
-                        self.state.matcher.commit_space(uncompressed_data);
-                        self.state.matcher.skip_matching();
-                        let header = BlockHeader {
-                            last_block,
-                            block_type: crate::blocks::block::BlockType::RLE,
-                            block_size: read_bytes.try_into().unwrap(),
-                        };
-                        // Write the header, then the block
-                        header.serialize(output);
-                        output.push(rle_byte);
-                    } else {
-                        let mut compressed = Vec::new();
-                        self.state.matcher.commit_space(uncompressed_data);
-                        compress_block(&mut self.state, &mut compressed);
-                        if compressed.len() >= MAX_BLOCK_SIZE as usize {
-                            let header = BlockHeader {
-                                last_block,
-                                block_type: crate::blocks::block::BlockType::Raw,
-                                block_size: read_bytes.try_into().unwrap(),
-                            };
-                            // Write the header, then the block
-                            header.serialize(output);
-                            output.extend_from_slice(self.state.matcher.get_last_space());
-                        } else {
-                            let header = BlockHeader {
-                                last_block,
-                                block_type: crate::blocks::block::BlockType::Compressed,
-                                block_size: (compressed.len()).try_into().unwrap(),
-                            };
-                            // Write the header, then the block
-                            header.serialize(output);
-                            output.extend(compressed);
-                        }
-                    }
+                    compress_fastest(&mut self.state, last_block, uncompressed_data, output)
                 }
                 _ => {
                     unimplemented!();
diff --git a/src/encoding/levels/fastest.rs b/src/encoding/levels/fastest.rs
new file mode 100644
index 000000000..4ec875727
--- /dev/null
+++ b/src/encoding/levels/fastest.rs
@@ -0,0 +1,67 @@
+use crate::{
+    common::MAX_BLOCK_SIZE,
+    encoding::{
+        block_header::BlockHeader, blocks::compress_block, frame_compressor::CompressState, Matcher,
+    },
+};
+use alloc::vec::Vec;
+
+/// Compresses a single block at [`crate::encoding::CompressionLevel::Fastest`].
+///
+/// # Parameters
+/// - `state`: [`CompressState`] so the compressor can refer to data before
+///   the start of this block
+/// - `last_block`: Whether or not this block is going to be the last block in the frame
+///   (needed because this info is written into the block header)
+/// - `uncompressed_data`: A block's worth of uncompressed data, taken from the
+///   larger input. Must not be empty: the first byte is read unconditionally.
+/// - `output`: The block header, followed by the block contents, is appended to `output`.
+#[inline]
+pub fn compress_fastest<M: Matcher>(
+    state: &mut CompressState<M>,
+    last_block: bool,
+    uncompressed_data: Vec<u8>,
+    output: &mut Vec<u8>,
+) {
+    let block_size = uncompressed_data.len() as u32;
+    // Use run length encoding when every byte of the block equals the first one
+    if uncompressed_data.iter().all(|x| uncompressed_data[0].eq(x)) {
+        let rle_byte = uncompressed_data[0];
+        state.matcher.commit_space(uncompressed_data);
+        state.matcher.skip_matching();
+        let header = BlockHeader {
+            last_block,
+            block_type: crate::blocks::block::BlockType::RLE,
+            block_size,
+        };
+        // Write the header, then the block
+        header.serialize(output);
+        output.push(rle_byte);
+    } else {
+        // Otherwise compress into a scratch buffer so its final size is known for the header
+        let mut compressed = Vec::new();
+        state.matcher.commit_space(uncompressed_data);
+        compress_block(state, &mut compressed);
+        // If the compressed data is at least as large as the maximum
+        // allowable block size, store the block raw (uncompressed) instead
+        if compressed.len() >= MAX_BLOCK_SIZE as usize {
+            let header = BlockHeader {
+                last_block,
+                block_type: crate::blocks::block::BlockType::Raw,
+                block_size,
+            };
+            // Write the header, then the block
+            header.serialize(output);
+            output.extend_from_slice(state.matcher.get_last_space());
+        } else {
+            let header = BlockHeader {
+                last_block,
+                block_type: crate::blocks::block::BlockType::Compressed,
+                block_size: compressed.len() as u32,
+            };
+            // Write the header, then the block
+            header.serialize(output);
+            output.extend(compressed);
+        }
+    }
+}
diff --git a/src/encoding/levels/mod.rs b/src/encoding/levels/mod.rs
new file mode 100644
index 000000000..fb39caaf8
--- /dev/null
+++ b/src/encoding/levels/mod.rs
@@ -0,0 +1,2 @@
+mod fastest;
+pub use fastest::compress_fastest;
diff --git a/src/encoding/mod.rs b/src/encoding/mod.rs
index 62b1fdd94..33c98a591 100644
--- a/src/encoding/mod.rs
+++ b/src/encoding/mod.rs
@@ -7,6 +7,7 @@ pub(crate) mod match_generator;
 pub(crate) mod util;
 
 mod frame_compressor;
+mod levels;
 pub use frame_compressor::FrameCompressor;
 
 use crate::io::{Read, Write};
@@ -68,7 +69,8 @@ pub enum CompressionLevel {
 /// making their own tradeoffs between runtime, memory usage and compression ratio
 ///
 /// This trait operates on buffers that represent the chunks of data the matching algorithm wants to work on.
-/// One or more of these buffers represent the window the decoder will need to decode the data again.
+/// Each one of these buffers is referred to as a *space*. One or more of these buffers represent the window
+/// the decoder will need to decode the data again.
 ///
 /// This library asks the Matcher for a new buffer using `get_next_space` to allow reusing of allocated buffers when they are no longer part of the
 /// window of data that is being used for matching.

From 4cb43dbabb784c845bd26bd6c02e481af0cd3c19 Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Fri, 18 Apr 2025 10:44:00 -0600
Subject: [PATCH 02/16] sync

---
 src/encoding/blocks/compressed.rs | 1 +
 src/encoding/levels/mod.rs        | 2 ++
 src/encoding/mod.rs               | 2 +-
 src/lib.rs                        | 1 +
 4 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/encoding/blocks/compressed.rs b/src/encoding/blocks/compressed.rs
index 7cf956c28..ad6e89a6d 100644
--- a/src/encoding/blocks/compressed.rs
+++ b/src/encoding/blocks/compressed.rs
@@ -8,6 +8,7 @@ use crate::{
     huff0::huff0_encoder,
 };
 
+/// Encodes the contents of a [`crate::blocks::block::BlockType::Compressed`] block into `output`
 pub fn compress_block<M: Matcher>(state: &mut CompressState<M>, output: &mut Vec<u8>) {
     let mut literals_vec = Vec::new();
     let mut sequences = Vec::new();
diff --git a/src/encoding/levels/mod.rs b/src/encoding/levels/mod.rs
index fb39caaf8..ce6f66bd8 100644
--- a/src/encoding/levels/mod.rs
+++ b/src/encoding/levels/mod.rs
@@ -1,2 +1,4 @@
 mod fastest;
 pub use fastest::compress_fastest;
+mod default;
+pub use default::compress_default;
diff --git a/src/encoding/mod.rs b/src/encoding/mod.rs
index 33c98a591..2d9797e9b 100644
--- a/src/encoding/mod.rs
+++ b/src/encoding/mod.rs
@@ -105,7 +105,7 @@ pub enum Sequence<'data> {
     /// Is encoded as a sequence for the decoder sequence execution.
     ///
     /// First the literals will be copied to the decoded data,
-    /// then `match_len` bytes are copied from `offset` bytes back in the buffer
+    /// then `match_len` bytes are copied from `offset` bytes back in the decoded data
     Triple {
         literals: &'data [u8],
         offset: usize,
diff --git a/src/lib.rs b/src/lib.rs
index 0d87f5ee3..6ca080bba 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -36,6 +36,7 @@ mod bit_io;
 mod common;
 pub mod decoding;
 pub mod encoding;
+mod dictionary;
 
 pub(crate) mod blocks;
 

From 0390c9aa3f08435e2b37ae1598446f5dcbf9efd3 Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Mon, 21 Jul 2025 12:43:01 -0600
Subject: [PATCH 03/16] feat(dict): bare structure of dictionary creation

---
 Cargo.toml                     |   1 +
 src/dictionary/cover.rs        | 127 +++++++++++++++++++++++++++++++
 src/dictionary/frequency.rs    |  71 ++++++++++++++++++
 src/dictionary/mod.rs          |  25 +++++++
 src/dictionary/reservoir.rs    | 133 +++++++++++++++++++++++++++++++++
 src/encoding/levels/default.rs |  27 +++++++
 src/lib.rs                     |   2 +-
 7 files changed, 385 insertions(+), 1 deletion(-)
 create mode 100644 src/dictionary/cover.rs
 create mode 100644 src/dictionary/frequency.rs
 create mode 100644 src/dictionary/mod.rs
 create mode 100644 src/dictionary/reservoir.rs
 create mode 100644 src/encoding/levels/default.rs

diff --git a/Cargo.toml b/Cargo.toml
index e45c2b395..5d5207df7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,6 +20,7 @@ twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"]
 compiler_builtins = { version = "0.1.2", optional = true }
 core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
 alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" }
+fastrand = "2.3.0"
 
 [dev-dependencies]
 criterion = "0.5"
diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs
new file mode 100644
index 000000000..0179ce3f9
--- /dev/null
+++ b/src/dictionary/cover.rs
@@ -0,0 +1,127 @@
+//! An implementation of the dictionary generation algorithm
+//! described in the paper "Effective Construction of Relative Lempel-Ziv Dictionaries",
+//! by Liao, Petri, Moffat, and Wirth, published under the University of Melbourne.
+//!
+//! See: https://people.eng.unimelb.edu.au/ammoffat/abstracts/lpmw16www.pdf
+//!
+//! Facebook's implementation was also used as a reference.
+//! https://github.com/facebook/zstd/tree/dev/lib/dictBuilder
+
+use std::collections::HashMap;
+use std::vec::Vec;
+
+use crate::dictionary::frequency::compute_frequency;
+
+/// A set of values that are used during dictionary construction.
+///
+/// Changing these values can improve the resulting dictionary size for certain datasets.
+struct DictParams {
+    /// Segment size.
+    ///
+    /// As found under "4. Experiments - Varying Segment Size" in the original paper, a
+    /// segment size of 2 kiB was effective.
+    ///
+    /// "We explored a range of [segment_size] values and found the performance of LMC is insensitive
+    /// to [segment_size]. We fix [segment_size] to 2kiB"
+    ///
+    /// Reasonable range: [16, 2048+]
+    segment_size: u32,
+    /// k-mer size
+    ///
+    ///As found under "4: Experiments - Varying k-mer Size" in the original paper,
+    /// "when k = 16, across all our text collections, there is a reasonable spread"
+    ///
+    /// Reasonable range: [6, 16]
+    ///
+    /// For now this value is ignored, and globally set to 16.
+    k: u32,
+}
+
+struct Segment {
+    /// Relative to the beginning of the epoch,
+    /// the index of the start of the segment
+    starting_offset: u32,
+    /// A measure of how "ideal" a given segment would be to include in the dictionary.
+    score: u32,
+}
+
+/// A re-usable allocation containing large allocations
+/// that are used multiple times during dictionary construction (once per epoch)
+struct Context {
+    /// Keeps track of the number of occurances of a particular k-mer
+    frequencies: HashMap<[u8; 2], usize>,
+    /// A collection of k-mers to be used in the final dictionary
+    pool: Vec<[u8; 2]>,
+}
+
+impl Context {
+    fn new() -> Self {
+        Self {
+            frequencies: HashMap::new(),
+            pool: Vec::new(),
+        }
+    }
+}
+
+/// Returns the highest scoring segment in an epoch
+/// as a slice of that epoch.
+fn pick_best_segment<'epoch>(
+    params: DictParams,
+    ctx: &mut Context,
+    epoch: &'epoch [u8],
+) -> &'epoch [u8] {
+    let mut best_segment: &[u8] = &epoch[0..params.segment_size as usize];
+    let mut top_segment_score = 0;
+    // Iterate over segments and score each segment, keeping track of the best segment
+    for segment in epoch.chunks(params.segment_size as usize) {
+        let segment_score = score_segment(ctx, epoch, segment);
+        if segment_score > top_segment_score {
+            best_segment = segment;
+            top_segment_score = segment_score;
+        }
+    }
+
+    best_segment
+}
+
+/// Given a segment, compute the score (or usefulness) of that segment against the entire epoch.
+///
+/// `score_segment` modifies ctx.frequencies.
+fn score_segment(ctx: &mut Context, epoch: &[u8], segment: &[u8]) -> usize {
+    let mut segment_score = 0;
+    // Determine the score of each overlapping k-mer
+    for i in 0..segment.len() - 1 {
+        let kmer = [segment[i], segment[i + 1]];
+        // if the kmer is already in the pool, it recieves a score of zero
+        if !ctx.frequencies.contains_key(&kmer) {
+            continue;
+        }
+        let kmer_score = compute_frequency(kmer, epoch);
+        ctx.frequencies.insert(kmer, kmer_score);
+        segment_score += kmer_score;
+    }
+
+    segment_score
+}
+
+/// Computes the number of epochs and the size of each epoch.
+///
+/// Returns a (number of epochs, epoch size) tuple, or `(0, 0)` when `num_kmers` is 0.
+///
+/// A translation of `COVER_epoch_info_t COVER_computeEpochs()` from facebook/zstd.
+fn compute_epoch_info(
+    params: DictParams,
+    max_dict_size: usize,
+    num_kmers: usize,
+) -> (usize, usize) {
+    let min_epoch_size = 10_000; // counted in k-mers, the same unit as `epoch_size`
+    let mut num_epochs: usize = usize::max(1, max_dict_size / params.segment_size as usize);
+    let mut epoch_size: usize = num_kmers / num_epochs;
+    if epoch_size >= min_epoch_size {
+        assert!(epoch_size * num_epochs <= num_kmers);
+        return (num_epochs, epoch_size);
+    }
+    epoch_size = usize::min(min_epoch_size, num_kmers);
+    num_epochs = num_kmers.checked_div(epoch_size).unwrap_or(0); // epoch_size is 0 for no input
+    (num_epochs, epoch_size)
+}
diff --git a/src/dictionary/frequency.rs b/src/dictionary/frequency.rs
new file mode 100644
index 000000000..5661af3dd
--- /dev/null
+++ b/src/dictionary/frequency.rs
@@ -0,0 +1,71 @@
+//! Contains `compute_frequency`, a function
+//! that uses a rolling Karp-Rabin hash to
+//! efficiently count the number of occurrences
+//! of a given k-mer within a set.
+
+/// Computes a best effort guess as to how many times `pattern` occurs within
+/// `body`. While not 100% accurate, it will be accurate the vast majority of time
+pub(super) fn compute_frequency(pattern: [u8; 2], body: &[u8]) -> usize {
+    assert!(body.len() >= pattern.len());
+    // A prime number for modulo operations to reduce collisions (q)
+    const PRIME: usize = 2654435761;
+    // Number of characters in the input alphabet (d)
+    const ALPHABET_SIZE: usize = 256;
+    // Hash of input pattern (p)
+    let mut input_hash: usize = 0;
+    // Hash of the current window of text (t)
+    let mut window_hash: usize = 0;
+    // High-order digit multiplier (h): the weight of the oldest byte in the window
+    let mut h: usize = 1;
+
+    // Precompute h = ALPHABET_SIZE^(pattern.len() - 1) mod PRIME, for any pattern length
+    (1..pattern.len()).for_each(|_| h = (h * ALPHABET_SIZE) % PRIME);
+
+    // Compute initial hash values
+    for i in 0..pattern.len() {
+        input_hash = (ALPHABET_SIZE * input_hash + pattern[i] as usize) % PRIME;
+        window_hash = (ALPHABET_SIZE * window_hash + body[i] as usize) % PRIME;
+    }
+
+    let mut num_occurances = 0;
+    for i in 0..=body.len() - pattern.len() {
+        // There's *probably* a match if these two match
+        if input_hash == window_hash {
+            num_occurances += 1;
+        }
+
+        // Roll the window: drop body[i] (adding PRIME first so the subtraction cannot underflow)
+        if i < body.len() - pattern.len() {
+            let without_first = (window_hash + PRIME - (body[i] as usize * h) % PRIME) % PRIME;
+            window_hash =
+                (ALPHABET_SIZE * without_first + body[i + pattern.len()] as usize) % PRIME;
+        }
+    }
+
+    num_occurances
+}
+
+#[cfg(test)]
+mod tests {
+    use super::compute_frequency;
+    #[test]
+    fn dead_beef() {
+        assert_eq!(
+            compute_frequency([0xde, 0xad], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]),
+            2
+        );
+    }
+
+    #[test]
+    fn smallest_body() {
+        assert_eq!(compute_frequency([0x00, 0xff], &[0x00, 0xff]), 1);
+    }
+
+    #[test]
+    fn no_match() {
+        assert_eq!(
+            compute_frequency([0xff, 0xff], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]),
+            0
+        );
+    }
+}
diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs
new file mode 100644
index 000000000..ba700660e
--- /dev/null
+++ b/src/dictionary/mod.rs
@@ -0,0 +1,25 @@
+//! Code for creating a separate content dictionary.
+//!
+//! Implemented following the paper "Effective construction of
+//! Relative Lempel-Ziv Dictionaries", by Kewen Liao, Matthias Petri,
+//! Alistair Moffat, and Anthony Wirth
+
+// The algorithm is summarized here
+// 1. The text is split into "epochs", or chunks from the original source
+// 2. From within each epoch, we select the "segment", or 1 KiB contiguous section
+//    that's predicted to be the best option to include in the dictionary. Concatenated,
+//    these segments form the dictionary.
+//
+// This segment scoring algorithm operates as follows:
+// For a given epoch:
+//  - Run a reservoir sampler over the entire epoch, creating a
+//    reservoir of n/t, where `t` is the desired number of occurrences
+//    we want the most common k-mers to have
+//  - Have the ability to estimate
+//    the frequency of a given k-mer: f(w: k-mer) calculates
+//    the frequency of w in the reservoir using a rolling karp-rabin hash
+//  - The score of a segment is the sum of `f(w)` called on every kmer within the segment
+
+mod cover;
+mod frequency;
+mod reservoir;
diff --git a/src/dictionary/reservoir.rs b/src/dictionary/reservoir.rs
new file mode 100644
index 000000000..40cfcd4b8
--- /dev/null
+++ b/src/dictionary/reservoir.rs
@@ -0,0 +1,133 @@
+use crate::io;
+use core::f64::consts::E;
+use std::{dbg, io::ErrorKind};
+use fastrand;
+use alloc::vec::Vec;
+
+/// A reservoir is created from an input stream.
+///
+/// Once filled, it will contain a best effort sample of a dataset,
+/// where each input value has an equivalent probability of being included.
+struct Reservoir {
+    /// Where the sampled data is stored
+    lake: Vec<u8>,
+    /// K is the size of each sample.
+    ///
+    /// The original Zstd dictionary implementation states that values
+    /// between 16 and 2048+ are reasonable.
+    k: u16,
+}
+
+impl Reservoir {
+    pub fn new(size: usize) -> Self {
+        assert!(size >= 16, "Reservoirs cannot be below 16 bytes in size");
+        let mut lake = Vec::with_capacity(size);
+        lake.resize(size, 0);
+        let k: u16 = 16;
+        Self {
+            lake,
+            k
+        }
+    }
+    /// Filling the reservoir is performed using Algorithm L.
+    ///
+    /// The return value is the populated reservoir.
+    pub fn fill<R: io::Read>(mut self, source: &mut R) -> Result<Vec<u8>, io::Error> {
+        // https://en.wikipedia.org/wiki/Reservoir_sampling#:~:text=end%0A%20%20end%0Aend-,Optimal%3A%20Algorithm,-L%5Bedit
+        // https://richardstartin.github.io/posts/reservoir-sampling#algorithm-l:~:text=%3B%0A%20%20%20%20%7D%0A%7D-,Algorithm%20L,-Algorithm%20L%20was
+        // First fill the reservoir with the start of the input stream
+        let mut total_bytes_read: usize = 0;
+        while let Ok(num_bytes) = source.read(self.lake.as_mut_slice()) {
+            total_bytes_read += num_bytes;
+            // Stop when we've completely filled the buffer
+            if total_bytes_read == self.lake.len() {
+                break;
+            }
+            // If we haven't filled the lake all the way, resize it
+            if num_bytes == 0 {
+                self.lake.resize(total_bytes_read, 0);
+            }
+        }
+
+        let mut threshold = E.powf(fastrand::f64().ln() / f64::from(self.k));
+        // An index into the stream of the next sample to take
+        let mut next = self.lake.len();
+        // Because we're sampling k-mers of size K into the lake,
+        // split the lake into chunks of k size for simplicity
+        let mut lake_chunks = self
+            .lake
+            .chunks_mut(self.k as usize)
+            .collect::<Vec<&mut [u8]>>();
+
+        let end_of_lake = lake_chunks.len();
+        let mut counter = end_of_lake / self.k as usize;
+        // Algorithm L is considered better than algorithm R because it
+        // determines how many inputs can be skipped, rather than
+        // processing every input.
+
+        // This is done by abusing the statistics in ways
+        // I do not understand.
+
+        // Items with a weight smaller than the threshold enter the lake,
+        // replacing the item in the lake with the largest threshold
+        let mut dumpster = Vec::with_capacity(self.k as usize);
+        loop {
+            let num_bytes_read;
+            if counter == next {
+                num_bytes_read = source
+                    .read(lake_chunks[fastrand::usize(0..end_of_lake)])
+                    .unwrap();
+                // Advance at least to the next sample, skipping forward a few samples
+                next += ((fastrand::f64().ln() / f64::ln(1.0 - threshold)).floor() as usize + 1)
+                    * self.k as usize;
+                // Update the threshold to reflect changes
+                threshold *= E.powf(fastrand::f64().ln() / f64::from(end_of_lake as u32))
+            } else {
+                // Drop the next chunk
+                num_bytes_read = source.read(&mut dumpster).unwrap();
+            }
+            if num_bytes_read == 0 {
+                break;
+            }
+            counter += self.k as usize;
+        }
+
+        Ok(self.lake)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Reservoir;
+    use alloc::vec;
+
+    #[test]
+    fn initial_fill() {
+        // Create a reservoir 16 bytes in size and read
+        // 16 bytes into it
+        let r = Reservoir::new(16);
+        let test_data = vec![0_u8; 16];
+        let output = r.fill(&mut test_data.as_slice()).unwrap();
+        assert_eq!(test_data, output);
+    }
+
+    #[test]
+    fn shrinks_for_small_sample() {
+        // Create a reservoir larger than the sample.
+        // The output should be smaller.
+        let r = Reservoir::new(32);
+        let test_data = vec![0_u8; 28];
+        let output = r.fill(&mut test_data.as_slice()).unwrap();
+        assert!(output.len() == 28);
+    }
+
+    #[test]
+    fn lake_doesnt_grow() {
+        // Create a sample larger than the reservoir
+        // The output should stay the size of the reservoir.
+        let r = Reservoir::new(32);
+        let test_data = vec![0_u8; 16_000_000];
+        let output = r.fill(&mut test_data.as_slice()).unwrap();
+        assert!(output.len() == 32);
+    }
+}
\ No newline at end of file
diff --git a/src/encoding/levels/default.rs b/src/encoding/levels/default.rs
new file mode 100644
index 000000000..4b83bd246
--- /dev/null
+++ b/src/encoding/levels/default.rs
@@ -0,0 +1,27 @@
+use crate::{
+    common::MAX_BLOCK_SIZE,
+    encoding::{
+        block_header::BlockHeader, blocks::compress_block, frame_compressor::CompressState, Matcher,
+    },
+};
+use alloc::vec::Vec;
+
+/// Placeholder for compressing a single block at [`crate::encoding::CompressionLevel::Default`].
+///
+/// # Parameters
+/// - `state`: [`CompressState`] so the compressor can refer to data prior to
+///   the start of this block
+/// - `last_block`: Whether or not this block is going to be the last block in the frame
+///   (needed because this info is written into the block header)
+/// - `uncompressed_data`: A block's worth of uncompressed data, taken from the
+///   larger input
+/// - `output`: Where the compressed block is meant to be appended; nothing is written yet.
+#[inline]
+pub fn compress_default<M: Matcher>(
+    state: &mut CompressState<M>,
+    last_block: bool,
+    uncompressed_data: Vec<u8>,
+    output: &mut Vec<u8>,
+) {
+    let block_size = uncompressed_data.len() as u32;
+}
diff --git a/src/lib.rs b/src/lib.rs
index 6ca080bba..5d456bd12 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -35,8 +35,8 @@ macro_rules! vprintln {
 mod bit_io;
 mod common;
 pub mod decoding;
-pub mod encoding;
 mod dictionary;
+pub mod encoding;
 
 pub(crate) mod blocks;
 

From 31974c1ea08d668bbaec0d667b0160cc3f3c81b5 Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Fri, 25 Jul 2025 12:33:42 -0600
Subject: [PATCH 04/16] .

---
 src/dictionary/cover.rs     | 52 +++++++++++++++++++++----------------
 src/dictionary/frequency.rs |  2 +-
 src/dictionary/reader.rs    |  2 ++
 src/dictionary/reservoir.rs | 13 ++++------
 4 files changed, 38 insertions(+), 31 deletions(-)
 create mode 100644 src/dictionary/reader.rs

diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs
index 0179ce3f9..ad436ffa0 100644
--- a/src/dictionary/cover.rs
+++ b/src/dictionary/cover.rs
@@ -12,6 +12,14 @@ use std::vec::Vec;
 
 use crate::dictionary::frequency::compute_frequency;
 
+/// The size of each k-mer
+pub(super) const K: usize = 16;
+///As found under "4: Experiments - Varying k-mer Size" in the original paper,
+/// "when k = 16, across all our text collections, there is a reasonable spread"
+///
+/// Reasonable range: [6, 16]
+pub(super) type KMer = [u8; K];
+
 /// A set of values that are used during dictionary construction.
 ///
 /// Changing these values can improve the resulting dictionary size for certain datasets.
@@ -26,32 +34,29 @@ struct DictParams {
     ///
     /// Reasonable range: [16, 2048+]
     segment_size: u32,
-    /// k-mer size
-    ///
-    ///As found under "4: Experiments - Varying k-mer Size" in the original paper,
-    /// "when k = 16, across all our text collections, there is a reasonable spread"
-    ///
-    /// Reasonable range: [6, 16]
-    ///
-    /// For now this value is ignored, and globally set to 16.
-    k: u32,
 }
 
 struct Segment {
-    /// Relative to the beginning of the epoch,
-    /// the index of the start of the segment
-    starting_offset: u32,
-    /// A measure of how "ideal" a given segment would be to include in the dictionary.
-    score: u32,
+    /// The actual contents of the segment.
+    raw: Vec<u8>,
+    /// A measure of how "ideal" a given segment would be to include in the dictionary
+    ///
+    /// Higher is better, there's no upper limit. This number is determined by
+    /// estimating the number of occurances in a given epoch
+    score: usize,
 }
 
 /// A re-usable allocation containing large allocations
 /// that are used multiple times during dictionary construction (once per epoch)
 struct Context {
-    /// Keeps track of the number of occurances of a particular k-mer
-    frequencies: HashMap<[u8; 2], usize>,
-    /// A collection of k-mers to be used in the final dictionary
-    pool: Vec<[u8; 2]>,
+    /// Keeps track of the number of occurances of a particular k-mer within an epoch.
+    ///
+    /// Reset for each epoch.
+    frequencies: HashMap<KMer, usize>,
+    /// A collection of segments to be used in the final dictionary.
+    ///
+    /// Contains the best segment from every epoch.
+    pool: Vec<Segment>,
 }
 
 impl Context {
@@ -69,9 +74,9 @@ fn pick_best_segment<'epoch>(
     params: DictParams,
     ctx: &mut Context,
     epoch: &'epoch [u8],
-) -> &'epoch [u8] {
+) -> Segment {
     let mut best_segment: &[u8] = &epoch[0..params.segment_size as usize];
-    let mut top_segment_score = 0;
+    let mut top_segment_score: usize = 0;
     // Iterate over segments and score each segment, keeping track of the best segment
     for segment in epoch.chunks(params.segment_size as usize) {
         let segment_score = score_segment(ctx, epoch, segment);
@@ -81,7 +86,10 @@ fn pick_best_segment<'epoch>(
         }
     }
 
-    best_segment
+    Segment {
+        raw: best_segment.into(),
+        score: top_segment_score,
+    }
 }
 
 /// Given a segment, compute the score (or usefulness) of that segment against the entire epoch.
@@ -91,7 +99,7 @@ fn score_segment(ctx: &mut Context, epoch: &[u8], segment: &[u8]) -> usize {
     let mut segment_score = 0;
     // Determine the score of each overlapping k-mer
     for i in 0..segment.len() - 1 {
-        let kmer = [segment[i], segment[i + 1]];
+        let kmer: &KMer = &(segment[i..i + K].try_into().expect("Failed to make kmer"));
         // if the kmer is already in the pool, it recieves a score of zero
         if !ctx.frequencies.contains_key(&kmer) {
             continue;
diff --git a/src/dictionary/frequency.rs b/src/dictionary/frequency.rs
index 5661af3dd..3c9d489e6 100644
--- a/src/dictionary/frequency.rs
+++ b/src/dictionary/frequency.rs
@@ -5,7 +5,7 @@
 
 /// Computes a best effort guess as to how many times `pattern` occurs within
 /// `body`. While not 100% accurate, it will be accurate the vast majority of time
-pub(super) fn compute_frequency(pattern: [u8; 2], body: &[u8]) -> usize {
+pub(super) fn compute_frequency(pattern: KMer, body: &[u8]) -> usize {
     assert!(body.len() >= pattern.len());
     // A prime number for modulo operations to reduce collisions (q)
     const PRIME: usize = 2654435761;
diff --git a/src/dictionary/reader.rs b/src/dictionary/reader.rs
new file mode 100644
index 000000000..d283ec099
--- /dev/null
+++ b/src/dictionary/reader.rs
@@ -0,0 +1,2 @@
+//! Provides an interface for reading from a large number of files without loading them all into
+//! memory
diff --git a/src/dictionary/reservoir.rs b/src/dictionary/reservoir.rs
index 40cfcd4b8..77b2a9317 100644
--- a/src/dictionary/reservoir.rs
+++ b/src/dictionary/reservoir.rs
@@ -1,8 +1,8 @@
+use super::cover::K;
 use crate::io;
+use alloc::vec::Vec;
 use core::f64::consts::E;
-use std::{dbg, io::ErrorKind};
 use fastrand;
-use alloc::vec::Vec;
 
 /// A reservoir is created from an input stream.
 ///
@@ -23,11 +23,8 @@ impl Reservoir {
         assert!(size >= 16, "Reservoirs cannot be below 16 bytes in size");
         let mut lake = Vec::with_capacity(size);
         lake.resize(size, 0);
-        let k: u16 = 16;
-        Self {
-            lake,
-            k
-        }
+        let k = K as u16;
+        Self { lake, k }
     }
     /// Filling the reservoir is performed using Algorithm L.
     ///
@@ -130,4 +127,4 @@ mod tests {
         let output = r.fill(&mut test_data.as_slice()).unwrap();
         assert!(output.len() == 32);
     }
-}
\ No newline at end of file
+}

From e3e99e061d2ee7e92f7661671677b552a520e3bd Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Tue, 29 Jul 2025 19:52:45 -0600
Subject: [PATCH 05/16] .

---
 src/dictionary/cover.rs     | 21 +++++++++++++++------
 src/dictionary/frequency.rs |  8 ++++----
 src/dictionary/mod.rs       |  8 +++++++-
 src/dictionary/reader.rs    |  2 --
 src/dictionary/reservoir.rs | 37 ++++++++++++++++++++++++++-----------
 5 files changed, 52 insertions(+), 24 deletions(-)
 delete mode 100644 src/dictionary/reader.rs

diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs
index ad436ffa0..a859986da 100644
--- a/src/dictionary/cover.rs
+++ b/src/dictionary/cover.rs
@@ -7,11 +7,13 @@
 //! Facebook's implementation was also used as a reference.
 //! https://github.com/facebook/zstd/tree/dev/lib/dictBuilder
 
+use crate::dictionary::frequency::compute_frequency;
+use crate::dictionary::reservoir::create_sample;
+use core::convert::TryInto;
 use std::collections::HashMap;
+use std::io::Cursor;
 use std::vec::Vec;
 
-use crate::dictionary::frequency::compute_frequency;
-
 /// The size of each k-mer
 pub(super) const K: usize = 16;
 ///As found under "4: Experiments - Varying k-mer Size" in the original paper,
@@ -96,16 +98,23 @@ fn pick_best_segment<'epoch>(
 ///
 /// `score_segment` modifies ctx.frequencies.
 fn score_segment(ctx: &mut Context, epoch: &[u8], segment: &[u8]) -> usize {
+    // Create a reservoir sample of the entire epoch
+    // so we can estimate frequencies without checking the entire epoch
+    // TODO: epoch size / 10 was chosen randomly, find a better way to determine reservoir size
+    let epoch_sample = create_sample(&mut Cursor::new(epoch), epoch.len() / 10);
+
     let mut segment_score = 0;
     // Determine the score of each overlapping k-mer
     for i in 0..segment.len() - 1 {
-        let kmer: &KMer = &(segment[i..i + K].try_into().expect("Failed to make kmer"));
+        let kmer: &KMer = (&segment[i..i + K])
+            .try_into()
+            .expect("Failed to make kmer");
         // if the kmer is already in the pool, it recieves a score of zero
-        if !ctx.frequencies.contains_key(&kmer) {
+        if !ctx.frequencies.contains_key(kmer) {
             continue;
         }
-        let kmer_score = compute_frequency(kmer, epoch);
-        ctx.frequencies.insert(kmer, kmer_score);
+        let kmer_score = compute_frequency(kmer, &epoch_sample);
+        ctx.frequencies.insert(*kmer, kmer_score);
         segment_score += kmer_score;
     }
 
diff --git a/src/dictionary/frequency.rs b/src/dictionary/frequency.rs
index 3c9d489e6..e3035f657 100644
--- a/src/dictionary/frequency.rs
+++ b/src/dictionary/frequency.rs
@@ -5,7 +5,7 @@
 
 /// Computes a best effort guess as to how many times `pattern` occurs within
 /// `body`. While not 100% accurate, it will be accurate the vast majority of time
-pub(super) fn compute_frequency(pattern: KMer, body: &[u8]) -> usize {
+pub(super) fn compute_frequency(pattern: &[u8], body: &[u8]) -> usize {
     assert!(body.len() >= pattern.len());
     // A prime number for modulo operations to reduce collisions (q)
     const PRIME: usize = 2654435761;
@@ -51,20 +51,20 @@ mod tests {
     #[test]
     fn dead_beef() {
         assert_eq!(
-            compute_frequency([0xde, 0xad], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]),
+            compute_frequency(&[0xde, 0xad], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]),
             2
         );
     }
 
     #[test]
     fn smallest_body() {
-        assert_eq!(compute_frequency([0x00, 0xff], &[0x00, 0xff]), 1);
+        assert_eq!(compute_frequency(&[0x00, 0xff], &[0x00, 0xff]), 1);
     }
 
     #[test]
     fn no_match() {
         assert_eq!(
-            compute_frequency([0xff, 0xff], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]),
+            compute_frequency(&[0xff, 0xff], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]),
             0
         );
     }
diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs
index ba700660e..bb031d2e8 100644
--- a/src/dictionary/mod.rs
+++ b/src/dictionary/mod.rs
@@ -19,7 +19,13 @@
 //    the frequency of a given k-mer: f(w: k-mer) calculates
 //    the frequency of w in the reservoir using a rolling karp-rabin hash
 //  - The score of a segment is the sum of `f(w)` called on every kmer within the segment
-
 mod cover;
 mod frequency;
 mod reservoir;
+
+use cover::*;
+use std::io::{Read, Write};
+
+/// Read from `source` to create a dictionary of `dict_size`. The completed dictionary is written
+/// to `output`.
+pub fn create_dict_from_reader<R: Read, W: Write>(source: R, &mut output: W, dict_size: usize) {}
diff --git a/src/dictionary/reader.rs b/src/dictionary/reader.rs
deleted file mode 100644
index d283ec099..000000000
--- a/src/dictionary/reader.rs
+++ /dev/null
@@ -1,2 +0,0 @@
-//! Provides an interface for reading from a large number of files without loading them all into
-//! memory
diff --git a/src/dictionary/reservoir.rs b/src/dictionary/reservoir.rs
index 77b2a9317..5249f87e0 100644
--- a/src/dictionary/reservoir.rs
+++ b/src/dictionary/reservoir.rs
@@ -1,15 +1,24 @@
 use super::cover::K;
-use crate::io;
 use alloc::vec::Vec;
 use core::f64::consts::E;
 use fastrand;
+use std::io;
+
+/// Creates a representative sample of `input` of `size` bytes.
+pub fn create_sample<R: io::Read>(input: &mut R, size: usize) -> Vec<u8> {
+    let reservoir = Reservoir::new(size);
+    reservoir.fill(input)
+}
 
 /// A reservoir is created from an input stream.
 ///
 /// Once filled, it will contain a best effort sample of a dataset,
 /// where each input value has an equivalent probability of being included.
 struct Reservoir {
-    /// Where the sampled data is stored
+    /// Where the sampled data is stored.
+    ///
+    /// Once the lake is filled, then this should contain a representative sample
+    /// of the larger dataset.
     lake: Vec<u8>,
     /// K is the size of each sample.
     ///
@@ -19,6 +28,7 @@ struct Reservoir {
 }
 
 impl Reservoir {
+    /// Initialize a new empty reservoir, creating an allocation of `size`.
     pub fn new(size: usize) -> Self {
         assert!(size >= 16, "Reservoirs cannot be below 16 bytes in size");
         let mut lake = Vec::with_capacity(size);
@@ -26,12 +36,14 @@ impl Reservoir {
         let k = K as u16;
         Self { lake, k }
     }
+
     /// Filling the reservoir is performed using Algorithm L.
     ///
     /// The return value is the populated reservoir.
-    pub fn fill<R: io::Read>(mut self, source: &mut R) -> Result<Vec<u8>, io::Error> {
+    pub fn fill<R: io::Read>(mut self, source: &mut R) -> Vec<u8> {
         // https://en.wikipedia.org/wiki/Reservoir_sampling#:~:text=end%0A%20%20end%0Aend-,Optimal%3A%20Algorithm,-L%5Bedit
         // https://richardstartin.github.io/posts/reservoir-sampling#algorithm-l:~:text=%3B%0A%20%20%20%20%7D%0A%7D-,Algorithm%20L,-Algorithm%20L%20was
+
         // First fill the reservoir with the start of the input stream
         let mut total_bytes_read: usize = 0;
         while let Ok(num_bytes) = source.read(self.lake.as_mut_slice()) {
@@ -55,7 +67,7 @@ impl Reservoir {
             .lake
             .chunks_mut(self.k as usize)
             .collect::<Vec<&mut [u8]>>();
-
+        // Used when discarding chunks
         let end_of_lake = lake_chunks.len();
         let mut counter = end_of_lake / self.k as usize;
         // Algorithm L is considered better than algorithm R because it
@@ -67,13 +79,15 @@ impl Reservoir {
 
         // Items with a weight smaller than the threshold enter the lake,
         // replacing the item in the lake with the largest threshold
+
         let mut dumpster = Vec::with_capacity(self.k as usize);
         loop {
-            let num_bytes_read;
+            // `num_bytes_read` is kept track of to watch for EOD.
+            let num_bytes_read: u64;
             if counter == next {
                 num_bytes_read = source
                     .read(lake_chunks[fastrand::usize(0..end_of_lake)])
-                    .unwrap();
+                    .unwrap() as u64;
                 // Advance at least to the next sample, skipping forward a few samples
                 next += ((fastrand::f64().ln() / f64::ln(1.0 - threshold)).floor() as usize + 1)
                     * self.k as usize;
@@ -81,7 +95,8 @@ impl Reservoir {
                 threshold *= E.powf(fastrand::f64().ln() / f64::from(end_of_lake as u32))
             } else {
                 // Drop the next chunk
-                num_bytes_read = source.read(&mut dumpster).unwrap();
+                num_bytes_read = source.read(&mut dumpster).unwrap() as u64;
+                //source.seek_relative(self.k.into()).unwrap();
             }
             if num_bytes_read == 0 {
                 break;
@@ -89,7 +104,7 @@ impl Reservoir {
             counter += self.k as usize;
         }
 
-        Ok(self.lake)
+        self.lake
     }
 }
 
@@ -104,7 +119,7 @@ mod tests {
         // 16 bytes into it
         let r = Reservoir::new(16);
         let test_data = vec![0_u8; 16];
-        let output = r.fill(&mut test_data.as_slice()).unwrap();
+        let output = r.fill(&mut test_data.as_slice());
         assert_eq!(test_data, output);
     }
 
@@ -114,7 +129,7 @@ mod tests {
         // The output should be smaller.
         let r = Reservoir::new(32);
         let test_data = vec![0_u8; 28];
-        let output = r.fill(&mut test_data.as_slice()).unwrap();
+        let output = r.fill(&mut test_data.as_slice());
         assert!(output.len() == 28);
     }
 
@@ -124,7 +139,7 @@ mod tests {
         // The output should be smaller.
         let r = Reservoir::new(32);
         let test_data = vec![0_u8; 16_000_000];
-        let output = r.fill(&mut test_data.as_slice()).unwrap();
+        let output = r.fill(&mut test_data.as_slice());
         assert!(output.len() == 32);
     }
 }

From 7e0d4a303af1c11bd7a27c1d780c9468e130e399 Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Thu, 31 Jul 2025 20:45:35 -0600
Subject: [PATCH 06/16] dict: more scaffolding for file processing

---
 src/dictionary/cover.rs     | 34 +++++--------------------
 src/dictionary/mod.rs       | 51 +++++++++++++++++++++++++++++++++++--
 src/dictionary/reservoir.rs |  2 +-
 3 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs
index a859986da..f8e319cbb 100644
--- a/src/dictionary/cover.rs
+++ b/src/dictionary/cover.rs
@@ -7,11 +7,12 @@
 //! Facebook's implementation was also used as a reference.
 //! https://github.com/facebook/zstd/tree/dev/lib/dictBuilder
 
+use super::DictParams;
 use crate::dictionary::frequency::compute_frequency;
 use crate::dictionary::reservoir::create_sample;
 use core::convert::TryInto;
 use std::collections::HashMap;
-use std::io::Cursor;
+use std::io::{Cursor, Read};
 use std::vec::Vec;
 
 /// The size of each k-mer
@@ -22,23 +23,7 @@ pub(super) const K: usize = 16;
 /// Reasonable range: [6, 16]
 pub(super) type KMer = [u8; K];
 
-/// A set of values that are used during dictionary construction.
-///
-/// Changing these values can improve the resulting dictionary size for certain datasets.
-struct DictParams {
-    /// Segment size.
-    ///
-    /// As found under "4. Experiments - Varying Segment Size" in the original paper, a
-    /// segment size of 2 kiB was effective.
-    ///
-    /// "We explored a range of [segment_size] values and found the performance of LMC is insensitive
-    /// to [segment_size]. We fix [segment_size] to 2kiB
-    ///
-    /// Reasonable range: [16, 2048+]
-    segment_size: u32,
-}
-
-struct Segment {
+pub struct Segment {
     /// The actual contents of the segment.
     raw: Vec<u8>,
     /// A measure of how "ideal" a given segment would be to include in the dictionary
@@ -72,7 +57,7 @@ impl Context {
 
 /// Returns the highest scoring segment in an epoch
 /// as a slice of that epoch.
-fn pick_best_segment<'epoch>(
+pub fn pick_best_segment<'epoch>(
     params: DictParams,
     ctx: &mut Context,
     epoch: &'epoch [u8],
@@ -97,12 +82,7 @@ fn pick_best_segment<'epoch>(
 /// Given a segment, compute the score (or usefulness) of that segment against the entire epoch.
 ///
 /// `score_segment` modifies ctx.frequencies.
-fn score_segment(ctx: &mut Context, epoch: &[u8], segment: &[u8]) -> usize {
-    // Create a reservoir sample of the entire epoch
-    // so we can estimate frequencies without checking the entire epoch
-    // TODO: epoch size / 10 was chosen randomly, find a better way to determine reservoir size
-    let epoch_sample = create_sample(&mut Cursor::new(epoch), epoch.len() / 10);
-
+fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> usize {
     let mut segment_score = 0;
     // Determine the score of each overlapping k-mer
     for i in 0..segment.len() - 1 {
@@ -113,7 +93,7 @@ fn score_segment(ctx: &mut Context, epoch: &[u8], segment: &[u8]) -> usize {
         if !ctx.frequencies.contains_key(kmer) {
             continue;
         }
-        let kmer_score = compute_frequency(kmer, &epoch_sample);
+        let kmer_score = compute_frequency(kmer, &collection_sample);
         ctx.frequencies.insert(*kmer, kmer_score);
         segment_score += kmer_score;
     }
@@ -126,7 +106,7 @@ fn score_segment(ctx: &mut Context, epoch: &[u8], segment: &[u8]) -> usize {
 /// Returns a (number of epochs, epoch size) tuple.
 ///
 /// A translation of `COVER_epoch_info_t COVER_computeEpochs()` from facebook/zstd.
-fn compute_epoch_info(
+pub fn compute_epoch_info(
     params: DictParams,
     max_dict_size: usize,
     num_kmers: usize,
diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs
index bb031d2e8..b16ca235c 100644
--- a/src/dictionary/mod.rs
+++ b/src/dictionary/mod.rs
@@ -1,9 +1,14 @@
 //! Code for creating a separate content dictionary.
 //!
+//! Effective dictionaries are up to 1% the size of the complete training body,
+//! and are trained on many examples of the original data.
+//!
 //! Implemented following the paper "Effective construction of
 //! Relative Lempel-Ziv Dictionaries", by Kewen Liao, Matthias Petri,
 //! Alistair Moffat, and Anthony Wirth
 
+const GIBIBYTE: usize = 1 << 30;
+
 // The algorithm is summarized here
 // 1. The text is split into "epochs", or chunks from the original source
 // 2. From within each epoch, we select the "segment", or 1 KiB contiguous section
@@ -24,8 +29,50 @@ mod frequency;
 mod reservoir;
 
 use cover::*;
-use std::io::{Read, Write};
+use std::io::{self, BufReader};
+
+use crate::dictionary::reservoir::create_sample;
+
+/// A set of values that are used during dictionary construction.
+///
+/// Changing these values can improve the resulting dictionary size for certain datasets.
+pub struct DictParams {
+    /// Segment size.
+    ///
+    /// As found under "4. Experiments - Varying Segment Size" in the original paper, a
+    /// segment size of 2 kiB was effective.
+    ///
+    /// "We explored a range of [segment_size] values and found the performance of LMC is insensitive
+    /// to [segment_size]. We fix [segment_size] to 2kiB
+    ///
+    /// Reasonable range: [16, 2048+]
+    pub segment_size: u32,
+}
 
 /// Read from `source` to create a dictionary of `dict_size`. The completed dictionary is written
 /// to `output`.
-pub fn create_dict_from_reader<R: Read, W: Write>(source: R, &mut output: W, dict_size: usize) {}
+///
+/// - `source` will be used as training data for the entire dictionary.
+/// - `source_size` influences how the data is divided and sampled and is measured
+///    in bytes. While this does not need to be exact, estimates should attempt to be
+///    larger than the actual collection size.
+/// - `output` is where the completed dictionary will be written.
+/// - `dict_size` determines how large the complete dictionary should be. The completed
+///   dictionary will be this size or smaller.
+///
+/// `source` is wrapped in a `BufReader` internally, so the provided reader need not be buffered.
+pub fn create_dict_from_source<R: io::Read, W: io::Write>(
+    source: R,
+    source_size: usize,
+    output: &mut W,
+    dict_size: usize,
+) {
+    let params = DictParams { segment_size: 2048 };
+    let mut buffered_source = BufReader::new(source);
+    let sample_size = buffered_source;
+    let collection_sample = create_sample(&mut buffered_source, 2 * GIBIBYTE);
+    // According to 4. Experiments - Varying Reservoir Sampler Thresholds,
+    // setting reservoir size to collection size / min{collection size / 2 * number of segments,
+    // 256} was effective
+    let (epoch_size, num_epochs) = compute_epoch_info(params, dict_size, num_kmers);
+}
diff --git a/src/dictionary/reservoir.rs b/src/dictionary/reservoir.rs
index 5249f87e0..041ebc498 100644
--- a/src/dictionary/reservoir.rs
+++ b/src/dictionary/reservoir.rs
@@ -103,7 +103,7 @@ impl Reservoir {
             }
             counter += self.k as usize;
         }
-
+        self.lake.shrink_to_fit();
         self.lake
     }
 }

From be49b6dbba51544417ff7ac727f587cf93122f43 Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Thu, 7 Aug 2025 21:08:23 -0600
Subject: [PATCH 07/16] dict: rudimentary implementation

---
 src/bin/zstd.rs             | 81 ++++++++++++++++++++-----------------
 src/dictionary/cover.rs     | 57 +++++++++++++++++---------
 src/dictionary/frequency.rs | 25 +++++++-----
 src/dictionary/mod.rs       | 57 +++++++++++++++++++++++---
 src/lib.rs                  |  4 +-
 5 files changed, 151 insertions(+), 73 deletions(-)

diff --git a/src/bin/zstd.rs b/src/bin/zstd.rs
index bdd80bd7e..07f616308 100644
--- a/src/bin/zstd.rs
+++ b/src/bin/zstd.rs
@@ -9,6 +9,7 @@ use std::time::Instant;
 
 use ruzstd::decoding::errors::FrameDecoderError;
 use ruzstd::decoding::errors::ReadFrameHeaderError;
+use ruzstd::dictionary::create_dict_from_source;
 use ruzstd::encoding::CompressionLevel;
 use ruzstd::encoding::FrameCompressor;
 
@@ -153,44 +154,48 @@ impl<R: Read> Read for PercentPrintReader<R> {
 }
 
 fn main() {
-    let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect();
-    let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect();
-    file_paths.remove(0);
-
-    if flags.is_empty() {
-        let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
-        encoder.set_drain(Vec::new());
-
-        for path in file_paths {
-            let start_instant = Instant::now();
-            let file = std::fs::File::open(&path).unwrap();
-            let input_len = file.metadata().unwrap().len() as usize;
-            let file = PercentPrintReader {
-                reader: BufReader::new(file),
-                total: input_len,
-                counter: 0,
-                last_percent: 0,
-            };
-            encoder.set_source(file);
-            encoder.compress();
-            let mut output: Vec<_> = encoder.take_drain().unwrap();
-            println!(
-                "Compressed {path:} from {} to {} ({}%) took {}ms",
-                input_len,
-                output.len(),
-                if input_len == 0 {
-                    0
-                } else {
-                    output.len() * 100 / input_len
-                },
-                start_instant.elapsed().as_millis()
-            );
-            output.clear();
-            encoder.set_drain(output);
-        }
-    } else {
-        decompress(&flags, &file_paths);
-    }
+    let input = File::open("local_corpus_files/enwik9").expect("open input file");
+    let input_len = input.metadata().unwrap().len() as usize;
+    let mut output = File::create("output.dict").expect("create output file");
+    create_dict_from_source(input, input_len, &mut output, 5_000_000);
+    //let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect();
+    //let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect();
+    //file_paths.remove(0);
+    //
+    //if flags.is_empty() {
+    //    let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
+    //    encoder.set_drain(Vec::new());
+    //
+    //    for path in file_paths {
+    //        let start_instant = Instant::now();
+    //        let file = std::fs::File::open(&path).unwrap();
+    //        let input_len = file.metadata().unwrap().len() as usize;
+    //        let file = PercentPrintReader {
+    //            reader: BufReader::new(file),
+    //            total: input_len,
+    //            counter: 0,
+    //            last_percent: 0,
+    //        };
+    //        encoder.set_source(file);
+    //        encoder.compress();
+    //        let mut output: Vec<_> = encoder.take_drain().unwrap();
+    //        println!(
+    //            "Compressed {path:} from {} to {} ({}%) took {}ms",
+    //            input_len,
+    //            output.len(),
+    //            if input_len == 0 {
+    //                0
+    //            } else {
+    //                output.len() * 100 / input_len
+    //            },
+    //            start_instant.elapsed().as_millis()
+    //        );
+    //        output.clear();
+    //        encoder.set_drain(output);
+    //    }
+    //} else {
+    //    decompress(&flags, &file_paths);
+    //}
 }
 
 fn do_something(data: &[u8], s: &mut StateTracker) {
diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs
index f8e319cbb..6cb4d0392 100644
--- a/src/dictionary/cover.rs
+++ b/src/dictionary/cover.rs
@@ -11,7 +11,7 @@ use super::DictParams;
 use crate::dictionary::frequency::compute_frequency;
 use crate::dictionary::reservoir::create_sample;
 use core::convert::TryInto;
-use std::collections::HashMap;
+use std::collections::{BinaryHeap, HashMap};
 use std::io::{Cursor, Read};
 use std::vec::Vec;
 
@@ -25,32 +25,52 @@ pub(super) type KMer = [u8; K];
 
 pub struct Segment {
     /// The actual contents of the segment.
-    raw: Vec<u8>,
+    pub raw: Vec<u8>,
     /// A measure of how "ideal" a given segment would be to include in the dictionary
     ///
     /// Higher is better, there's no upper limit. This number is determined by
     /// estimating the number of occurances in a given epoch
-    score: usize,
+    pub score: usize,
+}
+
+impl Eq for Segment {}
+
+impl PartialEq for Segment {
+    fn eq(&self, other: &Self) -> bool {
+        // We only really care about score in regards to heap order
+        self.score == other.score
+    }
+}
+
+impl PartialOrd for Segment {
+    /// Orders segments by `score` alone, delegating to [`Ord::cmp`] so
+    /// that the partial and total orderings can never disagree.
+    ///
+    /// The ordering is total, so this never returns `None`.
+    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for Segment {
+    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
+        self.score.cmp(&other.score)
+    }
 }
 
 /// A re-usable allocation containing large allocations
 /// that are used multiple times during dictionary construction (once per epoch)
-struct Context {
+pub struct Context {
     /// Keeps track of the number of occurances of a particular k-mer within an epoch.
     ///
     /// Reset for each epoch.
-    frequencies: HashMap<KMer, usize>,
-    /// A collection of segments to be used in the final dictionary.
-    ///
-    /// Contains the best segment from every epoch.
-    pool: Vec<Segment>,
+    pub frequencies: HashMap<KMer, usize>,
 }
 
 impl Context {
     fn new() -> Self {
         Self {
             frequencies: HashMap::new(),
-            pool: Vec::new(),
         }
     }
 }
@@ -58,15 +78,16 @@ impl Context {
 /// Returns the highest scoring segment in an epoch
 /// as a slice of that epoch.
 pub fn pick_best_segment<'epoch>(
-    params: DictParams,
+    params: &DictParams,
     ctx: &mut Context,
-    epoch: &'epoch [u8],
+    collection_sample: &'epoch [u8],
 ) -> Segment {
-    let mut best_segment: &[u8] = &epoch[0..params.segment_size as usize];
+    vprintln!("\tpick_best: picking best segment in epoch");
+    let mut best_segment: &[u8] = &collection_sample[0..params.segment_size as usize];
     let mut top_segment_score: usize = 0;
     // Iterate over segments and score each segment, keeping track of the best segment
-    for segment in epoch.chunks(params.segment_size as usize) {
-        let segment_score = score_segment(ctx, epoch, segment);
+    for segment in collection_sample.chunks(params.segment_size as usize) {
+        let segment_score = score_segment(ctx, collection_sample, segment);
         if segment_score > top_segment_score {
             best_segment = segment;
             top_segment_score = segment_score;
@@ -85,12 +106,12 @@ pub fn pick_best_segment<'epoch>(
 fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> usize {
     let mut segment_score = 0;
     // Determine the score of each overlapping k-mer
-    for i in 0..segment.len() - 1 {
+    for i in 0..segment.len() - K - 1 {
         let kmer: &KMer = (&segment[i..i + K])
             .try_into()
             .expect("Failed to make kmer");
         // if the kmer is already in the pool, it recieves a score of zero
-        if !ctx.frequencies.contains_key(kmer) {
+        if ctx.frequencies.contains_key(kmer) {
             continue;
         }
         let kmer_score = compute_frequency(kmer, &collection_sample);
@@ -107,7 +128,7 @@ fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) ->
 ///
 /// A translation of `COVER_epoch_info_t COVER_computeEpochs()` from facebook/zstd.
 pub fn compute_epoch_info(
-    params: DictParams,
+    params: &DictParams,
     max_dict_size: usize,
     num_kmers: usize,
 ) -> (usize, usize) {
diff --git a/src/dictionary/frequency.rs b/src/dictionary/frequency.rs
index e3035f657..ba3ca14d9 100644
--- a/src/dictionary/frequency.rs
+++ b/src/dictionary/frequency.rs
@@ -6,38 +6,45 @@
 /// Computes a best effort guess as to how many times `pattern` occurs within
 /// `body`. While not 100% accurate, it will be accurate the vast majority of time
 pub(super) fn compute_frequency(pattern: &[u8], body: &[u8]) -> usize {
+    //vprintln!(
+    //    "\tkarp-rabin: searching haystack of size {} for needle of size {} with ident {}",
+    //    pattern.len(),
+    //    body.len(),
+    //    pattern[0] + pattern[1]
+    //);
     assert!(body.len() >= pattern.len());
     // A prime number for modulo operations to reduce collisions (q)
-    const PRIME: usize = 2654435761;
+    const PRIME: isize = 2654435761;
     // Number of characters in the input alphabet (d)
-    const ALPHABET_SIZE: usize = 256;
+    const ALPHABET_SIZE: isize = 256;
     // Hash of input pattern (p)
-    let mut input_hash: usize = 0;
+    let mut input_hash: isize = 0;
     // Hash of the current window of text (t)
-    let mut window_hash: usize = 0;
+    let mut window_hash: isize = 0;
     // High-order digit multiplier (h)
-    let mut h: usize = 1;
+    let mut h: isize = 1;
 
     // Precompute h (?)
     h = (h * ALPHABET_SIZE) % PRIME;
 
     // Compute initial hash values
     for i in 0..pattern.len() {
-        input_hash = (ALPHABET_SIZE * input_hash + pattern[i] as usize) % PRIME;
-        window_hash = (ALPHABET_SIZE * window_hash + body[i] as usize) % PRIME;
+        input_hash = (ALPHABET_SIZE * input_hash + pattern[i] as isize) % PRIME;
+        window_hash = (ALPHABET_SIZE * window_hash + body[i] as isize) % PRIME;
     }
 
     let mut num_occurances = 0;
     for i in 0..=body.len() - pattern.len() {
         // There's *probably* a match if these two match
         if input_hash == window_hash {
+            vprintln!("\t\tkarp-rabin: found occurance in sample");
             num_occurances += 1;
         }
 
         // Compute hash values for next window
         if i < body.len() - pattern.len() {
-            window_hash = (ALPHABET_SIZE * (window_hash - body[i] as usize * h)
-                + body[i + pattern.len()] as usize)
+            window_hash = (ALPHABET_SIZE * (window_hash - body[i] as isize * h)
+                + body[i + pattern.len()] as isize)
                 % PRIME;
         }
     }
diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs
index b16ca235c..79273b95b 100644
--- a/src/dictionary/mod.rs
+++ b/src/dictionary/mod.rs
@@ -28,8 +28,13 @@ mod cover;
 mod frequency;
 mod reservoir;
 
+use core::cmp::Reverse;
 use cover::*;
-use std::io::{self, BufReader};
+use std::{
+    collections::{BinaryHeap, HashMap},
+    io::{self, BufReader, Read},
+    vec,
+};
 
 use crate::dictionary::reservoir::create_sample;
 
@@ -67,12 +72,52 @@ pub fn create_dict_from_source<R: io::Read, W: io::Write>(
     output: &mut W,
     dict_size: usize,
 ) {
+    vprintln!("create_dict: creating {dict_size} byte dict from {source_size} byte source");
+    let mut buffered_source = BufReader::with_capacity(5_000_000, source);
+
     let params = DictParams { segment_size: 2048 };
-    let mut buffered_source = BufReader::new(source);
-    let sample_size = buffered_source;
-    let collection_sample = create_sample(&mut buffered_source, 2 * GIBIBYTE);
+    let num_segments = source_size / params.segment_size as usize;
     // According to 4. Experiments - Varying Reservoir Sampler Thresholds,
-    // setting reservoir size to collection size / min{collection size / 2 * number of segments,
+    // setting reservoir size to collection size / min{collection size / (2 * number of segments),
     // 256} was effective
-    let (epoch_size, num_epochs) = compute_epoch_info(params, dict_size, num_kmers);
+    let sample_size = source_size / usize::min(source_size / (2 * num_segments), 256) / 1000;
+    vprintln!("create_dict: creating {sample_size} byte sample of collection");
+    let collection_sample = create_sample(&mut buffered_source, sample_size);
+
+    // A collection of segments to be used in the final dictionary.
+    //
+    // Contains the best segment from every epoch.
+    // Reverse is used because we want a min heap, where
+    // the lowest scoring items come first
+    let mut pool: BinaryHeap<Reverse<Segment>> = BinaryHeap::new();
+    let (num_epochs, epoch_size) = compute_epoch_info(&params, dict_size, source_size / K);
+    vprintln!("create_dict: computed epoch info, using {num_epochs} epochs of {epoch_size} bytes");
+    let mut current_epoch = vec![0; epoch_size];
+    let mut epoch_counter = 0;
+    let mut ctx = Context {
+        frequencies: HashMap::with_capacity(epoch_size / K),
+    };
+    // Score each segment in the epoch and select the highest scoring segment
+    // for the pool
+    while buffered_source
+        .read(&mut current_epoch)
+        .expect("can read input")
+        != 0
+    {
+        epoch_counter += 1;
+        let best_segment = pick_best_segment(&params, &mut ctx, &collection_sample);
+        vprintln!(
+            "\tcreate_dict: epoch {epoch_counter}/{num_epochs} has best segment score {}",
+            best_segment.score
+        );
+        pool.push(Reverse(best_segment));
+        // Wipe frequency list for next epoch
+        ctx.frequencies.clear();
+    }
+    vprintln!("create_dict: writing {} segments", pool.len());
+    // Write the dictionary with the highest scoring segment last because
+    // closer items can be represented with a smaller offset
+    while let Some(segment) = pool.pop() {
+        output.write(&segment.0.raw).expect("can write to output");
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 5d456bd12..49366d80d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -21,7 +21,7 @@ extern crate std;
 extern crate alloc;
 
 #[cfg(feature = "std")]
-pub(crate) const VERBOSE: bool = false;
+pub(crate) const VERBOSE: bool = true;
 
 macro_rules! vprintln {
     ($($x:expr),*) => {
@@ -35,7 +35,7 @@ macro_rules! vprintln {
 mod bit_io;
 mod common;
 pub mod decoding;
-mod dictionary;
+pub mod dictionary;
 pub mod encoding;
 
 pub(crate) mod blocks;

From 09e52d07340acdb2e13817b066e8be6e424f7258 Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Sun, 10 Aug 2025 18:35:24 -0600
Subject: [PATCH 08/16] sync

---
 Cargo.toml                  |  10 ++-
 src/bin/zstd.rs             |   2 +-
 src/bin/zstd_dict.rs        | 133 ++++++++++++++++++++++++++++++++++++
 src/dictionary/cover.rs     |  14 ++--
 src/dictionary/frequency.rs |  17 +++--
 src/dictionary/mod.rs       |  67 +++++++++++++++---
 6 files changed, 217 insertions(+), 26 deletions(-)
 create mode 100644 src/bin/zstd_dict.rs

diff --git a/Cargo.toml b/Cargo.toml
index 5d5207df7..ebd6dbdf7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,11 +21,12 @@ compiler_builtins = { version = "0.1.2", optional = true }
 core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
 alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" }
 fastrand = "2.3.0"
+zstd =  { version = "0.13.2", features = ["zstdmt"]}
+
 
 [dev-dependencies]
 criterion = "0.5"
 rand = { version = "0.8.5", features = ["small_rng"] }
-zstd = "0.13.2"
 
 [features]
 default = ["hash", "std"]
@@ -48,3 +49,10 @@ required-features = ["std"]
 [[bin]]
 name = "zstd_stream"
 required-features = ["std"]
+
+[[bin]]
+name = "zstd_dict"
+required-features = ["std"]
+dependancies = [
+    "zstd"
+]
diff --git a/src/bin/zstd.rs b/src/bin/zstd.rs
index 07f616308..3ca7344f1 100644
--- a/src/bin/zstd.rs
+++ b/src/bin/zstd.rs
@@ -157,7 +157,7 @@ fn main() {
     let input = File::open("local_corpus_files/enwik9").expect("open input file");
     let input_len = input.metadata().unwrap().len() as usize;
     let mut output = File::create("output.dict").expect("create output file");
-    create_dict_from_source(input, input_len, &mut output, 5_000_000);
+    create_dict_from_source(input, input_len, &mut output, 1_000_000);
     //let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect();
     //let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect();
     //file_paths.remove(0);
diff --git a/src/bin/zstd_dict.rs b/src/bin/zstd_dict.rs
new file mode 100644
index 000000000..87041f76b
--- /dev/null
+++ b/src/bin/zstd_dict.rs
@@ -0,0 +1,133 @@
+use ruzstd::dictionary::{create_dict_from_dir, create_dict_from_source};
+use std::fmt::Display;
+use std::fs::File;
+use std::io::{self, Cursor, Read, Write};
+use std::path::{Path, PathBuf};
+use std::{env::args, fs};
+
+fn main() {
+    //let args: Vec<String> = args().collect();
+    //let input_path: &Path = args.get(1).expect("no input provided").as_ref();
+    //let output_path: &Path = args.get(2).expect("no output path provided").as_ref();
+    //let dict_size = args
+    //    .get(3)
+    //    .expect("no dict size provided (kb)")
+    //    .parse::<usize>()
+    //    .expect("dict size was not a valid num");
+    //
+    //let mut output = File::create(output_path).unwrap();
+    //if input_path.is_file() {
+    //    let source = File::open(input_path).expect("unable to open input path");
+    //    let source_size = source.metadata().unwrap().len();
+    //    create_dict_from_source(source, source_size as usize, &mut output, dict_size);
+    //} else {
+    //    create_dict_from_dir(input_path, &mut output, dict_size).unwrap();
+    //}
+    print!("{}", bench("local_corpus_files/github/"));
+}
+
+struct BenchmarkResults {
+    uncompressed_size: usize,
+    nodict_size: usize,
+    reference_size: usize,
+    our_size: usize,
+}
+
+impl Display for BenchmarkResults {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "uncompressed: 100.00% ({})", self.uncompressed_size)?;
+        writeln!(
+            f,
+            "no dict: {:.2} ({})",
+            f64::from(self.nodict_size as u32) / f64::from(self.uncompressed_size as u32),
+            self.nodict_size
+        )?;
+        writeln!(
+            f,
+            "reference dict: {:.2} ({})",
+            f64::from(self.reference_size as u32) / f64::from(self.uncompressed_size as u32),
+            self.reference_size
+        )?;
+        write!(
+            f,
+            "our dict: {:.2} ({})",
+            f64::from(self.our_size as u32) / f64::from(self.uncompressed_size as u32),
+            self.our_size
+        )?;
+        Ok(())
+    }
+}
+
+fn bench<P: AsRef<Path>>(input_path: P) -> BenchmarkResults {
+    // At what compression level the dicts are built with
+    let compression_level = 22;
+
+    // 1. Collect a list of a path to every file in the directory into `file_paths`
+    println!("[bench]: collecting list of input files");
+    let mut file_paths: Vec<PathBuf> = Vec::new();
+    let dir: fs::ReadDir = fs::read_dir(&input_path).expect("read input path");
+    fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec<PathBuf>) -> Result<(), io::Error> {
+        for entry in dir {
+            let entry = entry?;
+            if entry.file_type()?.is_dir() {
+                recurse_read(fs::read_dir(&entry.path())?, file_paths)?;
+            } else {
+                file_paths.push(entry.path());
+            }
+        }
+        Ok(())
+    }
+    recurse_read(dir, &mut file_paths).expect("recursing over input dir");
+
+    // 2. Create two dictionaries, one with our strategy, and one with theirs
+    println!("[bench]: creating reference dict");
+    let reference_dict =
+        zstd::dict::from_files(file_paths.iter(), 112640).expect("create reference dict");
+    let mut our_dict = Vec::with_capacity(112640);
+    println!("[bench]: creating our dict");
+    create_dict_from_dir(input_path, &mut our_dict, 112640).expect("create our dict");
+    // Open each file and compress it
+    let mut uncompressed_size: usize = 0;
+    let mut nodict_size: usize = 0;
+    let mut reference_size: usize = 0;
+    let mut our_size: usize = 0;
+
+    let mut reference_output: Vec<u8> = Vec::with_capacity(128_000);
+    let mut reference_encoder =
+        zstd::Encoder::with_dictionary(&mut reference_output, compression_level, &reference_dict)
+            .unwrap();
+    reference_encoder.multithread(8).unwrap();
+    let mut our_output: Vec<u8> = Vec::with_capacity(128_000);
+    let mut our_encoder =
+        zstd::Encoder::with_dictionary(&mut our_output, compression_level, &our_dict).unwrap();
+    our_encoder.multithread(8).unwrap();
+    for (idx, path) in file_paths.iter().enumerate() {
+        println!("[bench]: compressing file {}/{}", idx + 1, file_paths.len());
+        let mut handle = File::open(path).unwrap();
+        let mut data = Vec::new();
+        handle.read_to_end(&mut data);
+        uncompressed_size += data.len();
+        // Compress with no dict
+        let nodict_output = zstd::encode_all(data.as_slice(), compression_level).unwrap();
+        nodict_size += nodict_output.len();
+        // Compress with the reference dict
+        reference_encoder.write_all(data.as_slice());
+        reference_encoder
+            .do_finish()
+            .expect("reference encoder finishes");
+        reference_size += reference_output.len();
+        reference_output.clear();
+        // Compress with our dict
+        our_encoder.write_all(data.as_slice());
+        our_encoder.finish().expect("our encoder finishes");
+        our_size += our_output.len();
+        our_output.clear();
+    }
+
+    BenchmarkResults {
+        uncompressed_size,
+        nodict_size,
+        reference_size,
+        our_size,
+    }
+}
diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs
index 6cb4d0392..407cc9471 100644
--- a/src/dictionary/cover.rs
+++ b/src/dictionary/cover.rs
@@ -8,7 +8,7 @@
 //! https://github.com/facebook/zstd/tree/dev/lib/dictBuilder
 
 use super::DictParams;
-use crate::dictionary::frequency::compute_frequency;
+use crate::dictionary::frequency::estimate_frequency;
 use crate::dictionary::reservoir::create_sample;
 use core::convert::TryInto;
 use std::collections::{BinaryHeap, HashMap};
@@ -82,11 +82,13 @@ pub fn pick_best_segment<'epoch>(
     ctx: &mut Context,
     collection_sample: &'epoch [u8],
 ) -> Segment {
-    vprintln!("\tpick_best: picking best segment in epoch");
-    let mut best_segment: &[u8] = &collection_sample[0..params.segment_size as usize];
+    let mut segments = collection_sample
+        .chunks(params.segment_size as usize)
+        .peekable();
+    let mut best_segment: &[u8] = &segments.peek().expect("at least one segment");
     let mut top_segment_score: usize = 0;
     // Iterate over segments and score each segment, keeping track of the best segment
-    for segment in collection_sample.chunks(params.segment_size as usize) {
+    for segment in segments {
         let segment_score = score_segment(ctx, collection_sample, segment);
         if segment_score > top_segment_score {
             best_segment = segment;
@@ -106,7 +108,7 @@ pub fn pick_best_segment<'epoch>(
 fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> usize {
     let mut segment_score = 0;
     // Determine the score of each overlapping k-mer
-    for i in 0..segment.len() - K - 1 {
+    for i in 0..(segment.len() - K - 1) {
         let kmer: &KMer = (&segment[i..i + K])
             .try_into()
             .expect("Failed to make kmer");
@@ -114,7 +116,7 @@ fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) ->
         if ctx.frequencies.contains_key(kmer) {
             continue;
         }
-        let kmer_score = compute_frequency(kmer, &collection_sample);
+        let kmer_score = estimate_frequency(kmer, &collection_sample);
         ctx.frequencies.insert(*kmer, kmer_score);
         segment_score += kmer_score;
     }
diff --git a/src/dictionary/frequency.rs b/src/dictionary/frequency.rs
index ba3ca14d9..72aa03531 100644
--- a/src/dictionary/frequency.rs
+++ b/src/dictionary/frequency.rs
@@ -5,7 +5,7 @@
 
 /// Computes a best effort guess as to how many times `pattern` occurs within
 /// `body`. While not 100% accurate, it will be accurate the vast majority of time
-pub(super) fn compute_frequency(pattern: &[u8], body: &[u8]) -> usize {
+pub(super) fn estimate_frequency(pattern: &[u8], body: &[u8]) -> usize {
     //vprintln!(
     //    "\tkarp-rabin: searching haystack of size {} for needle of size {} with ident {}",
     //    pattern.len(),
@@ -18,7 +18,7 @@ pub(super) fn compute_frequency(pattern: &[u8], body: &[u8]) -> usize {
     // Number of characters in the input alphabet (d)
     const ALPHABET_SIZE: isize = 256;
     // Hash of input pattern (p)
-    let mut input_hash: isize = 0;
+    let mut pattern_hash: isize = 0;
     // Hash of the current window of text (t)
     let mut window_hash: isize = 0;
     // High-order digit multiplier (h)
@@ -29,15 +29,14 @@ pub(super) fn compute_frequency(pattern: &[u8], body: &[u8]) -> usize {
 
     // Compute initial hash values
     for i in 0..pattern.len() {
-        input_hash = (ALPHABET_SIZE * input_hash + pattern[i] as isize) % PRIME;
+        pattern_hash = (ALPHABET_SIZE * pattern_hash + pattern[i] as isize) % PRIME;
         window_hash = (ALPHABET_SIZE * window_hash + body[i] as isize) % PRIME;
     }
 
     let mut num_occurances = 0;
     for i in 0..=body.len() - pattern.len() {
         // There's *probably* a match if these two match
-        if input_hash == window_hash {
-            vprintln!("\t\tkarp-rabin: found occurance in sample");
+        if pattern_hash == window_hash {
             num_occurances += 1;
         }
 
@@ -54,24 +53,24 @@ pub(super) fn compute_frequency(pattern: &[u8], body: &[u8]) -> usize {
 
 #[cfg(test)]
 mod tests {
-    use super::compute_frequency;
+    use super::estimate_frequency;
     #[test]
     fn dead_beef() {
         assert_eq!(
-            compute_frequency(&[0xde, 0xad], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]),
+            estimate_frequency(&[0xde, 0xad], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]),
             2
         );
     }
 
     #[test]
     fn smallest_body() {
-        assert_eq!(compute_frequency(&[0x00, 0xff], &[0x00, 0xff]), 1);
+        assert_eq!(estimate_frequency(&[0x00, 0xff], &[0x00, 0xff]), 1);
     }
 
     #[test]
     fn no_match() {
         assert_eq!(
-            compute_frequency(&[0xff, 0xff], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]),
+            estimate_frequency(&[0xff, 0xff], &[0xde, 0xad, 0xbe, 0xef, 0xde, 0xad]),
             0
         );
     }
diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs
index 79273b95b..cabdca906 100644
--- a/src/dictionary/mod.rs
+++ b/src/dictionary/mod.rs
@@ -7,8 +7,6 @@
 //! Relative Lempel-Ziv Dictionaries", by Kewen Liao, Matthias Petri,
 //! Alistair Moffat, and Anthony Wirth
 
-const GIBIBYTE: usize = 1 << 30;
-
 // The algorithm is summarized here
 // 1. The text is split into "epochs", or chunks from the original source
 // 2. From within each epoch, we select the "segment", or 1 KiB contiguous section
@@ -31,11 +29,17 @@ mod reservoir;
 use core::cmp::Reverse;
 use cover::*;
 use std::{
+    boxed::Box,
     collections::{BinaryHeap, HashMap},
+    dbg,
+    fs::{self, File},
     io::{self, BufReader, Read},
-    vec,
+    path::{Path, PathBuf},
+    vec::Vec,
 };
 
+use alloc::vec;
+
 use crate::dictionary::reservoir::create_sample;
 
 /// A set of values that are used during dictionary construction.
@@ -54,6 +58,46 @@ pub struct DictParams {
     pub segment_size: u32,
 }
 
+/// Create a dictionary
+pub fn create_dict_from_dir<P: AsRef<Path>, W: io::Write>(
+    path: P,
+    output: &mut W,
+    dict_size: usize,
+) -> Result<(), io::Error> {
+    // Collect a list of a path to every file in the directory into `file_paths`
+    let mut file_paths: Vec<PathBuf> = Vec::new();
+    let dir: fs::ReadDir = fs::read_dir(path)?;
+    fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec<PathBuf>) -> Result<(), io::Error> {
+        for entry in dir {
+            let entry = entry?;
+            if entry.file_type()?.is_dir() {
+                recurse_read(fs::read_dir(&entry.path())?, file_paths)?;
+            } else {
+                file_paths.push(entry.path());
+            }
+        }
+        Ok(())
+    }
+    recurse_read(dir, &mut file_paths)?;
+
+    // Open each file and chain the readers together
+    let mut total_file_len: u64 = 0;
+    let mut file_handles: Vec<fs::File> = Vec::new();
+    for path in file_paths {
+        let handle = File::open(path)?;
+        total_file_len += handle.metadata()?.len();
+        file_handles.push(handle);
+    }
+    let empty_reader: Box<dyn Read> = Box::new(io::empty());
+    let chained_files = file_handles
+        .iter()
+        .fold(empty_reader, |acc, reader| Box::new(acc.chain(reader)));
+
+    // Create a dict using the new reader
+    create_dict_from_source(chained_files, total_file_len as usize, output, dict_size);
+    Ok(())
+}
+
 /// Read from `source` to create a dictionary of `dict_size`. The completed dictionary is written
 /// to `output`.
 ///
@@ -80,7 +124,7 @@ pub fn create_dict_from_source<R: io::Read, W: io::Write>(
     // According to 4. Experiments - Varying Reservoir Sampler Thresholds,
     // setting reservoir size to collection size / min{collection size / (2 * number of segments),
     // 256} was effective
-    let sample_size = source_size / usize::min(source_size / (2 * num_segments), 256) / 1000;
+    let sample_size = source_size / usize::min(source_size / (2 * num_segments), 256);
     vprintln!("create_dict: creating {sample_size} byte sample of collection");
     let collection_sample = create_sample(&mut buffered_source, sample_size);
 
@@ -90,18 +134,20 @@ pub fn create_dict_from_source<R: io::Read, W: io::Write>(
     // Reverse is used because we want a min heap, where
     // the lowest scoring items come first
     let mut pool: BinaryHeap<Reverse<Segment>> = BinaryHeap::new();
-    let (num_epochs, epoch_size) = compute_epoch_info(&params, dict_size, source_size / K);
+    let (_, epoch_size) = compute_epoch_info(&params, dict_size, source_size / K);
+    let num_epochs = source_size / epoch_size;
     vprintln!("create_dict: computed epoch info, using {num_epochs} epochs of {epoch_size} bytes");
-    let mut current_epoch = vec![0; epoch_size];
+    //let mut current_epoch = vec![0; epoch_size];
+    let mut current_epoch = vec![0; 100];
     let mut epoch_counter = 0;
     let mut ctx = Context {
         frequencies: HashMap::with_capacity(epoch_size / K),
     };
     // Score each segment in the epoch and select the highest scoring segment
     // for the pool
-    while buffered_source
+    while dbg!(buffered_source
         .read(&mut current_epoch)
-        .expect("can read input")
+        .expect("can read input"))
         != 0
     {
         epoch_counter += 1;
@@ -114,7 +160,10 @@ pub fn create_dict_from_source<R: io::Read, W: io::Write>(
         // Wipe frequency list for next epoch
         ctx.frequencies.clear();
     }
-    vprintln!("create_dict: writing {} segments", pool.len());
+    vprintln!(
+        "create_dict: {epoch_counter} epochs written, writing {} segments",
+        pool.len()
+    );
     // Write the dictionary with the highest scoring segment last because
     // closer items can be represented with a smaller offset
     while let Some(segment) = pool.pop() {

From 34e2e909c621a2f92622ae357f92f027421b5102 Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Sun, 10 Aug 2025 20:05:40 -0600
Subject: [PATCH 09/16] dict: rudimentary implementation complete

---
 src/bin/zstd.rs      |  5 ++-
 src/bin/zstd_dict.rs | 93 +++++++++++++++++++++++++++++++-------------
 2 files changed, 68 insertions(+), 30 deletions(-)

diff --git a/src/bin/zstd.rs b/src/bin/zstd.rs
index 3ca7344f1..21a1feba7 100644
--- a/src/bin/zstd.rs
+++ b/src/bin/zstd.rs
@@ -154,10 +154,11 @@ impl<R: Read> Read for PercentPrintReader<R> {
 }
 
 fn main() {
-    let input = File::open("local_corpus_files/enwik9").expect("open input file");
+    let input = File::open("ik9").expect("open input file");
+    //let input = File::open("local_corpus_files/enwik9").expect("open input file");
     let input_len = input.metadata().unwrap().len() as usize;
     let mut output = File::create("output.dict").expect("create output file");
-    create_dict_from_source(input, input_len, &mut output, 1_000_000);
+    create_dict_from_source(input, input_len, &mut output, 5_000_000);
     //let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect();
     //let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect();
     //file_paths.remove(0);
diff --git a/src/bin/zstd_dict.rs b/src/bin/zstd_dict.rs
index 87041f76b..ab6096887 100644
--- a/src/bin/zstd_dict.rs
+++ b/src/bin/zstd_dict.rs
@@ -1,4 +1,5 @@
 use ruzstd::dictionary::{create_dict_from_dir, create_dict_from_source};
+use std::cell::RefCell;
 use std::fmt::Display;
 use std::fs::File;
 use std::io::{self, Cursor, Read, Write};
@@ -23,7 +24,7 @@ fn main() {
     //} else {
     //    create_dict_from_dir(input_path, &mut output, dict_size).unwrap();
     //}
-    print!("{}", bench("local_corpus_files/github/"));
+    print!("{}", bench("local_corpus_files/sat-txt-files/"));
 }
 
 struct BenchmarkResults {
@@ -38,30 +39,42 @@ impl Display for BenchmarkResults {
         writeln!(f, "uncompressed: 100.00% ({})", self.uncompressed_size)?;
         writeln!(
             f,
-            "no dict: {:.2} ({})",
-            f64::from(self.nodict_size as u32) / f64::from(self.uncompressed_size as u32),
+            "no dict: {:.2}% of original size ({})",
+            f64::from(self.nodict_size as u32) / f64::from(self.uncompressed_size as u32) * 100.0,
             self.nodict_size
         )?;
         writeln!(
             f,
-            "reference dict: {:.2} ({})",
-            f64::from(self.reference_size as u32) / f64::from(self.uncompressed_size as u32),
-            self.reference_size
+            "reference dict: {:.2}% of no dict size ({} bytes smaller)",
+            f64::from(self.reference_size as u32) / f64::from(self.nodict_size as u32) * 100.0,
+            self.nodict_size - self.reference_size
         )?;
         write!(
             f,
-            "our dict: {:.2} ({})",
-            f64::from(self.our_size as u32) / f64::from(self.uncompressed_size as u32),
-            self.our_size
+            "our dict: {:.2}% of no dict size ({} bytes smaller)",
+            f64::from(self.our_size as u32) / f64::from(self.nodict_size as u32) * 100.0,
+            self.nodict_size - self.our_size
         )?;
         Ok(())
     }
 }
 
+struct Dumpster(pub usize);
+
+impl Write for Dumpster {
+    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+        self.0 += buf.len();
+        Ok(buf.len())
+    }
+
+    fn flush(&mut self) -> io::Result<()> {
+        Ok(())
+    }
+}
+
 fn bench<P: AsRef<Path>>(input_path: P) -> BenchmarkResults {
     // At what compression level the dicts are built with
-    let compression_level = 22;
-
+    let compression_level = 1;
     // 1. Collect a list of a path to every file in the directory into `file_paths`
     println!("[bench]: collecting list of input files");
     let mut file_paths: Vec<PathBuf> = Vec::new();
@@ -89,45 +102,69 @@ fn bench<P: AsRef<Path>>(input_path: P) -> BenchmarkResults {
     // Open each file and compress it
     let mut uncompressed_size: usize = 0;
     let mut nodict_size: usize = 0;
-    let mut reference_size: usize = 0;
-    let mut our_size: usize = 0;
 
-    let mut reference_output: Vec<u8> = Vec::with_capacity(128_000);
+    let mut reference_output = Dumpster(0);
     let mut reference_encoder =
         zstd::Encoder::with_dictionary(&mut reference_output, compression_level, &reference_dict)
             .unwrap();
     reference_encoder.multithread(8).unwrap();
-    let mut our_output: Vec<u8> = Vec::with_capacity(128_000);
+    let mut our_output = Dumpster(0);
     let mut our_encoder =
         zstd::Encoder::with_dictionary(&mut our_output, compression_level, &our_dict).unwrap();
     our_encoder.multithread(8).unwrap();
     for (idx, path) in file_paths.iter().enumerate() {
-        println!("[bench]: compressing file {}/{}", idx + 1, file_paths.len());
+        if idx % 10 == 0 {
+            println!("[bench]: compressing file {}/{}", idx + 1, file_paths.len());
+        }
         let mut handle = File::open(path).unwrap();
         let mut data = Vec::new();
-        handle.read_to_end(&mut data);
+        handle.read_to_end(&mut data).unwrap();
         uncompressed_size += data.len();
         // Compress with no dict
         let nodict_output = zstd::encode_all(data.as_slice(), compression_level).unwrap();
         nodict_size += nodict_output.len();
         // Compress with the reference dict
-        reference_encoder.write_all(data.as_slice());
         reference_encoder
-            .do_finish()
-            .expect("reference encoder finishes");
-        reference_size += reference_output.len();
-        reference_output.clear();
+            .write_all(data.as_slice())
+            .expect("reference writer writing");
         // Compress with our dict
-        our_encoder.write_all(data.as_slice());
-        our_encoder.finish().expect("our encoder finishes");
-        our_size += our_output.len();
-        our_output.clear();
+        our_encoder
+            .write_all(data.as_slice())
+            .expect("our writer writing");
     }
+    //println!("[bench]: reading all files");
+    //let mut all_files: Vec<u8> = Vec::with_capacity(1_000_000);
+    //for path in file_paths {
+    //    let mut handle = File::open(path).unwrap();
+    //    handle
+    //        .read_to_end(&mut all_files)
+    //        .expect("reading input file");
+    //}
+    //uncompressed_size = all_files.len();
+    ////    // Compress with no dict
+    //println!("[bench]: compressing using no dict");
+    //let nodict_output = zstd::encode_all(all_files.as_slice(), compression_level).unwrap();
+    //nodict_size = nodict_output.len();
+    //drop(nodict_output);
+    //println!("[bench]: compressing using reference encoder");
+    //reference_encoder
+    //    .write_all(&all_files)
+    //    .expect("writing to reference encoder");
+    //println!("[bench]: compressing using our encoder");
+    //our_encoder
+    //    .write_all(&all_files)
+    //    .expect("writing to our encoder");
+    //our_encoder.do_finish().expect("our encoder finishes");
+    //reference_encoder
+    //    .do_finish()
+    //    .expect("reference encoder finishes");
+    //drop(reference_encoder);
+    //drop(our_encoder);
 
     BenchmarkResults {
         uncompressed_size,
         nodict_size,
-        reference_size,
-        our_size,
+        reference_size: reference_output.0,
+        our_size: our_output.0,
     }
 }

From bfea46ac44a05d7e4f5d7291e2523f95e9eae4fe Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Fri, 15 Aug 2025 10:35:27 -0600
Subject: [PATCH 10/16] dict: pre-clippy auto apply

---
 Cargo.toml                  |   5 +-
 benches/decode_all.rs       |   2 +-
 src/bin/zstd.rs             |  16 +-
 src/bin/zstd_dict.rs        | 294 ++++++++++++++++--------------------
 src/bin/zstd_stream.rs      |   5 +-
 src/dictionary/cover.rs     |  17 +--
 src/dictionary/frequency.rs |   8 +-
 src/dictionary/mod.rs       |  36 +++--
 8 files changed, 173 insertions(+), 210 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index ebd6dbdf7..d66f4490f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,12 +21,12 @@ compiler_builtins = { version = "0.1.2", optional = true }
 core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
 alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" }
 fastrand = "2.3.0"
-zstd =  { version = "0.13.2", features = ["zstdmt"]}
 
 
 [dev-dependencies]
 criterion = "0.5"
 rand = { version = "0.8.5", features = ["small_rng"] }
+zstd =  { version = "0.13.2", features = ["zstdmt"]}
 
 [features]
 default = ["hash", "std"]
@@ -53,6 +53,3 @@ required-features = ["std"]
 [[bin]]
 name = "zstd_dict"
 required-features = ["std"]
-dependancies = [
-    "zstd"
-]
diff --git a/benches/decode_all.rs b/benches/decode_all.rs
index 463113958..439986785 100644
--- a/benches/decode_all.rs
+++ b/benches/decode_all.rs
@@ -3,7 +3,7 @@ use ruzstd::decoding::FrameDecoder;
 
 fn criterion_benchmark(c: &mut Criterion) {
     let mut fr = FrameDecoder::new();
-    let mut target_slice = &mut vec![0u8; 1024 * 1024 * 200];
+    let target_slice = &mut vec![0u8; 1024 * 1024 * 200];
     let src = include_bytes!("../decodecorpus_files/z000033.zst");
 
     c.bench_function("decode_all_slice", |b| {
diff --git a/src/bin/zstd.rs b/src/bin/zstd.rs
index 21a1feba7..b55318036 100644
--- a/src/bin/zstd.rs
+++ b/src/bin/zstd.rs
@@ -1,17 +1,13 @@
 extern crate ruzstd;
 use std::fs::File;
-use std::io::BufReader;
 use std::io::Read;
 use std::io::Seek;
 use std::io::SeekFrom;
 use std::io::Write;
-use std::time::Instant;
 
 use ruzstd::decoding::errors::FrameDecoderError;
 use ruzstd::decoding::errors::ReadFrameHeaderError;
 use ruzstd::dictionary::create_dict_from_source;
-use ruzstd::encoding::CompressionLevel;
-use ruzstd::encoding::FrameCompressor;
 
 struct StateTracker {
     bytes_used: u64,
@@ -22,7 +18,7 @@ struct StateTracker {
     file_size: u64,
     old_percentage: i8,
 }
-
+#[allow(unused)]
 fn decompress(flags: &[String], file_paths: &[String]) {
     if !flags.contains(&"-d".to_owned()) {
         eprintln!("This zstd implementation only supports decompression. Please add a \"-d\" flag");
@@ -36,8 +32,7 @@ fn decompress(flags: &[String], file_paths: &[String]) {
 
     if flags.len() != 2 {
         eprintln!(
-            "No flags other than -d and -c are currently implemented. Flags used: {:?}",
-            flags
+            "No flags other than -d and -c are currently implemented. Flags used: {flags:?}"
         );
         return;
     }
@@ -45,7 +40,7 @@ fn decompress(flags: &[String], file_paths: &[String]) {
     let mut frame_dec = ruzstd::decoding::FrameDecoder::new();
 
     for path in file_paths {
-        eprintln!("File: {}", path);
+        eprintln!("File: {path}");
         let mut f = File::open(path).unwrap();
 
         let mut tracker = StateTracker {
@@ -132,6 +127,7 @@ fn decompress(flags: &[String], file_paths: &[String]) {
     }
 }
 
+#[allow(unused)]
 struct PercentPrintReader<R: Read> {
     total: usize,
     counter: usize,
@@ -147,7 +143,7 @@ impl<R: Read> Read for PercentPrintReader<R> {
         if progress > self.last_percent {
             self.last_percent = progress;
             eprint!("\r");
-            eprint!("{} % done", progress);
+            eprint!("{progress} % done");
         }
         Ok(new_bytes)
     }
@@ -207,7 +203,7 @@ fn do_something(data: &[u8], s: &mut StateTracker) {
     let percentage = (s.file_pos * 100) / s.file_size;
     if percentage as i8 != s.old_percentage {
         eprint!("\r");
-        eprint!("{} % done", percentage);
+        eprint!("{percentage} % done");
         s.old_percentage = percentage as i8;
     }
 }
diff --git a/src/bin/zstd_dict.rs b/src/bin/zstd_dict.rs
index ab6096887..f502700e7 100644
--- a/src/bin/zstd_dict.rs
+++ b/src/bin/zstd_dict.rs
@@ -1,170 +1,140 @@
 use ruzstd::dictionary::{create_dict_from_dir, create_dict_from_source};
-use std::cell::RefCell;
-use std::fmt::Display;
 use std::fs::File;
-use std::io::{self, Cursor, Read, Write};
-use std::path::{Path, PathBuf};
-use std::{env::args, fs};
+use std::path::Path;
+use std::env::args;
 
 fn main() {
-    //let args: Vec<String> = args().collect();
-    //let input_path: &Path = args.get(1).expect("no input provided").as_ref();
-    //let output_path: &Path = args.get(2).expect("no output path provided").as_ref();
-    //let dict_size = args
-    //    .get(3)
-    //    .expect("no dict size provided (kb)")
-    //    .parse::<usize>()
-    //    .expect("dict size was not a valid num");
-    //
-    //let mut output = File::create(output_path).unwrap();
-    //if input_path.is_file() {
-    //    let source = File::open(input_path).expect("unable to open input path");
-    //    let source_size = source.metadata().unwrap().len();
-    //    create_dict_from_source(source, source_size as usize, &mut output, dict_size);
-    //} else {
-    //    create_dict_from_dir(input_path, &mut output, dict_size).unwrap();
-    //}
-    print!("{}", bench("local_corpus_files/sat-txt-files/"));
-}
-
-struct BenchmarkResults {
-    uncompressed_size: usize,
-    nodict_size: usize,
-    reference_size: usize,
-    our_size: usize,
-}
-
-impl Display for BenchmarkResults {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        writeln!(f, "uncompressed: 100.00% ({})", self.uncompressed_size)?;
-        writeln!(
-            f,
-            "no dict: {:.2}% of original size ({})",
-            f64::from(self.nodict_size as u32) / f64::from(self.uncompressed_size as u32) * 100.0,
-            self.nodict_size
-        )?;
-        writeln!(
-            f,
-            "reference dict: {:.2}% of no dict size ({} bytes smaller)",
-            f64::from(self.reference_size as u32) / f64::from(self.nodict_size as u32) * 100.0,
-            self.nodict_size - self.reference_size
-        )?;
-        write!(
-            f,
-            "our dict: {:.2}% of no dict size ({} bytes smaller)",
-            f64::from(self.our_size as u32) / f64::from(self.nodict_size as u32) * 100.0,
-            self.nodict_size - self.our_size
-        )?;
-        Ok(())
-    }
-}
-
-struct Dumpster(pub usize);
-
-impl Write for Dumpster {
-    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        self.0 += buf.len();
-        Ok(buf.len())
-    }
+    let args: Vec<String> = args().collect();
+    let input_path: &Path = args.get(1).expect("no input provided").as_ref();
+    let output_path: &Path = args.get(2).expect("no output path provided").as_ref();
+    let dict_size = args
+        .get(3)
+        .expect("no dict size provided (kb)")
+        .parse::<usize>()
+        .expect("dict size was not a valid num");
 
-    fn flush(&mut self) -> io::Result<()> {
-        Ok(())
+    let mut output = File::create(output_path).unwrap();
+    if input_path.is_file() {
+        let source = File::open(input_path).expect("unable to open input path");
+        let source_size = source.metadata().unwrap().len();
+        create_dict_from_source(source, source_size as usize, &mut output, dict_size);
+    } else {
+        create_dict_from_dir(input_path, &mut output, dict_size).unwrap();
     }
 }
 
-fn bench<P: AsRef<Path>>(input_path: P) -> BenchmarkResults {
-    // At what compression level the dicts are built with
-    let compression_level = 1;
-    // 1. Collect a list of a path to every file in the directory into `file_paths`
-    println!("[bench]: collecting list of input files");
-    let mut file_paths: Vec<PathBuf> = Vec::new();
-    let dir: fs::ReadDir = fs::read_dir(&input_path).expect("read input path");
-    fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec<PathBuf>) -> Result<(), io::Error> {
-        for entry in dir {
-            let entry = entry?;
-            if entry.file_type()?.is_dir() {
-                recurse_read(fs::read_dir(&entry.path())?, file_paths)?;
-            } else {
-                file_paths.push(entry.path());
-            }
-        }
-        Ok(())
-    }
-    recurse_read(dir, &mut file_paths).expect("recursing over input dir");
-
-    // 2. Create two dictionaries, one with our strategy, and one with theirs
-    println!("[bench]: creating reference dict");
-    let reference_dict =
-        zstd::dict::from_files(file_paths.iter(), 112640).expect("create reference dict");
-    let mut our_dict = Vec::with_capacity(112640);
-    println!("[bench]: creating our dict");
-    create_dict_from_dir(input_path, &mut our_dict, 112640).expect("create our dict");
-    // Open each file and compress it
-    let mut uncompressed_size: usize = 0;
-    let mut nodict_size: usize = 0;
-
-    let mut reference_output = Dumpster(0);
-    let mut reference_encoder =
-        zstd::Encoder::with_dictionary(&mut reference_output, compression_level, &reference_dict)
-            .unwrap();
-    reference_encoder.multithread(8).unwrap();
-    let mut our_output = Dumpster(0);
-    let mut our_encoder =
-        zstd::Encoder::with_dictionary(&mut our_output, compression_level, &our_dict).unwrap();
-    our_encoder.multithread(8).unwrap();
-    for (idx, path) in file_paths.iter().enumerate() {
-        if idx % 10 == 0 {
-            println!("[bench]: compressing file {}/{}", idx + 1, file_paths.len());
-        }
-        let mut handle = File::open(path).unwrap();
-        let mut data = Vec::new();
-        handle.read_to_end(&mut data).unwrap();
-        uncompressed_size += data.len();
-        // Compress with no dict
-        let nodict_output = zstd::encode_all(data.as_slice(), compression_level).unwrap();
-        nodict_size += nodict_output.len();
-        // Compress with the reference dict
-        reference_encoder
-            .write_all(data.as_slice())
-            .expect("reference writer writing");
-        // Compress with our dict
-        our_encoder
-            .write_all(data.as_slice())
-            .expect("our writer writing");
-    }
-    //println!("[bench]: reading all files");
-    //let mut all_files: Vec<u8> = Vec::with_capacity(1_000_000);
-    //for path in file_paths {
-    //    let mut handle = File::open(path).unwrap();
-    //    handle
-    //        .read_to_end(&mut all_files)
-    //        .expect("reading input file");
-    //}
-    //uncompressed_size = all_files.len();
-    ////    // Compress with no dict
-    //println!("[bench]: compressing using no dict");
-    //let nodict_output = zstd::encode_all(all_files.as_slice(), compression_level).unwrap();
-    //nodict_size = nodict_output.len();
-    //drop(nodict_output);
-    //println!("[bench]: compressing using reference encoder");
-    //reference_encoder
-    //    .write_all(&all_files)
-    //    .expect("writing to reference encoder");
-    //println!("[bench]: compressing using our encoder");
-    //our_encoder
-    //    .write_all(&all_files)
-    //    .expect("writing to our encoder");
-    //our_encoder.do_finish().expect("our encoder finishes");
-    //reference_encoder
-    //    .do_finish()
-    //    .expect("reference encoder finishes");
-    //drop(reference_encoder);
-    //drop(our_encoder);
-
-    BenchmarkResults {
-        uncompressed_size,
-        nodict_size,
-        reference_size: reference_output.0,
-        our_size: our_output.0,
-    }
-}
+//struct BenchmarkResults {
+//    pub uncompressed_size: usize,
+//    pub nodict_size: usize,
+//    pub reference_size: usize,
+//    pub our_size: usize,
+//}
+//
+//impl Display for BenchmarkResults {
+//    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+//        writeln!(f, "uncompressed: 100.00% ({})", self.uncompressed_size)?;
+//        writeln!(
+//            f,
+//            "no dict: {:.2}% of original size ({})",
+//            f64::from(self.nodict_size as u32) / f64::from(self.uncompressed_size as u32) * 100.0,
+//            self.nodict_size
+//        )?;
+//        writeln!(
+//            f,
+//            "reference dict: {:.2}% of no dict size ({} bytes smaller)",
+//            f64::from(self.reference_size as u32) / f64::from(self.nodict_size as u32) * 100.0,
+//            self.nodict_size - self.reference_size
+//        )?;
+//        write!(
+//            f,
+//            "our dict: {:.2}% of no dict size ({} bytes smaller)",
+//            f64::from(self.our_size as u32) / f64::from(self.nodict_size as u32) * 100.0,
+//            self.nodict_size - self.our_size
+//        )?;
+//        Ok(())
+//    }
+//}
+//
+//struct Dumpster(pub usize);
+//
+//impl Write for Dumpster {
+//    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+//        self.0 += buf.len();
+//        Ok(buf.len())
+//    }
+//
+//    fn flush(&mut self) -> io::Result<()> {
+//        Ok(())
+//    }
+//}
+//
+///// Compares compression ratios achieved with a dictionary
+//#[allow(unused)]
+//fn bench<P: AsRef<Path>>(input_path: P) -> BenchmarkResults {
+//    // At what compression level the dicts are built with
+//    let compression_level = 1;
+//    // 1. Collect a list of a path to every file in the directory into `file_paths`
+//    println!("[bench]: collecting list of input files");
+//    let mut file_paths: Vec<PathBuf> = Vec::new();
+//    let dir: fs::ReadDir = fs::read_dir(&input_path).expect("read input path");
+//    fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec<PathBuf>) -> Result<(), io::Error> {
+//        for entry in dir {
+//            let entry = entry?;
+//            if entry.file_type()?.is_dir() {
+//                recurse_read(fs::read_dir(&entry.path())?, file_paths)?;
+//            } else {
+//                file_paths.push(entry.path());
+//            }
+//        }
+//        Ok(())
+//    }
+//    recurse_read(dir, &mut file_paths).expect("recursing over input dir");
+//
+//    // 2. Create two dictionaries, one with our strategy, and one with theirs
+//    println!("[bench]: creating reference dict");
+//    let reference_dict =
+//        zstd::dict::from_files(file_paths.iter(), 112640).expect("create reference dict");
+//    let mut our_dict = Vec::with_capacity(112640);
+//    println!("[bench]: creating our dict");
+//    create_dict_from_dir(input_path, &mut our_dict, 112640).expect("create our dict");
+//    // Open each file and compress it
+//    let mut uncompressed_size: usize = 0;
+//    let mut nodict_size: usize = 0;
+//
+//    let mut reference_output = Dumpster(0);
+//    let mut reference_encoder =
+//        zstd::Encoder::with_dictionary(&mut reference_output, compression_level, &reference_dict)
+//            .unwrap();
+//    reference_encoder.multithread(8).unwrap();
+//    let mut our_output = Dumpster(0);
+//    let mut our_encoder =
+//        zstd::Encoder::with_dictionary(&mut our_output, compression_level, &our_dict).unwrap();
+//    our_encoder.multithread(8).unwrap();
+//    for (idx, path) in file_paths.iter().enumerate() {
+//        if idx % 10 == 0 {
+//            println!("[bench]: compressing file {}/{}", idx + 1, file_paths.len());
+//        }
+//        let mut handle = File::open(path).unwrap();
+//        let mut data = Vec::new();
+//        handle.read_to_end(&mut data).unwrap();
+//        uncompressed_size += data.len();
+//        // Compress with no dict
+//        let nodict_output = zstd::encode_all(data.as_slice(), compression_level).unwrap();
+//        nodict_size += nodict_output.len();
+//        // Compress with the reference dict
+//        reference_encoder
+//            .write_all(data.as_slice())
+//            .expect("reference writer writing");
+//        // Compress with our dict
+//        our_encoder
+//            .write_all(data.as_slice())
+//            .expect("our writer writing");
+//    }
+//
+//    BenchmarkResults {
+//        uncompressed_size,
+//        nodict_size,
+//        reference_size: reference_output.0,
+//        our_size: our_output.0,
+//    }
+//}
diff --git a/src/bin/zstd_stream.rs b/src/bin/zstd_stream.rs
index 609530e5b..521abf464 100644
--- a/src/bin/zstd_stream.rs
+++ b/src/bin/zstd_stream.rs
@@ -19,14 +19,13 @@ fn main() {
 
     if flags.len() != 2 {
         eprintln!(
-            "No flags other than -d and -c are currently implemented. Flags used: {:?}",
-            flags
+            "No flags other than -d and -c are currently implemented. Flags used: {flags:?}"
         );
         return;
     }
 
     for path in file_paths {
-        eprintln!("File: {}", path);
+        eprintln!("File: {path}");
         let f = File::open(path).unwrap();
         let mut buf_read = std::io::BufReader::new(f);
 
diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs
index 407cc9471..9f2c94922 100644
--- a/src/dictionary/cover.rs
+++ b/src/dictionary/cover.rs
@@ -1,4 +1,4 @@
-//! An implementation of the dictionary generation algorithm
+//! An implementation of the local maximum coverage algorithm
 //! described in the paper "Effective Construction of Relative Lempel-Ziv Dictionaries",
 //! by Liao, Petri, Moffat, and Wirth, published under the University of Melbourne.
 //!
@@ -9,14 +9,13 @@
 
 use super::DictParams;
 use crate::dictionary::frequency::estimate_frequency;
-use crate::dictionary::reservoir::create_sample;
 use core::convert::TryInto;
-use std::collections::{BinaryHeap, HashMap};
-use std::io::{Cursor, Read};
+use std::collections::HashMap;
 use std::vec::Vec;
 
 /// The size of each k-mer
 pub(super) const K: usize = 16;
+
 ///As found under "4: Experiments - Varying k-mer Size" in the original paper,
 /// "when k = 16, across all our text collections, there is a reasonable spread"
 ///
@@ -67,14 +66,6 @@ pub struct Context {
     pub frequencies: HashMap<KMer, usize>,
 }
 
-impl Context {
-    fn new() -> Self {
-        Self {
-            frequencies: HashMap::new(),
-        }
-    }
-}
-
 /// Returns the highest scoring segment in an epoch
 /// as a slice of that epoch.
 pub fn pick_best_segment<'epoch>(
@@ -104,7 +95,7 @@ pub fn pick_best_segment<'epoch>(
 
 /// Given a segment, compute the score (or usefulness) of that segment against the entire epoch.
 ///
-/// `score_segment` modifies ctx.frequencies.
+/// `score_segment` modifies `ctx.frequencies`.
 fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) -> usize {
     let mut segment_score = 0;
     // Determine the score of each overlapping k-mer
diff --git a/src/dictionary/frequency.rs b/src/dictionary/frequency.rs
index 72aa03531..074e73839 100644
--- a/src/dictionary/frequency.rs
+++ b/src/dictionary/frequency.rs
@@ -5,13 +5,7 @@
 
 /// Computes a best effort guess as to how many times `pattern` occurs within
 /// `body`. While not 100% accurate, it will be accurate the vast majority of time
-pub(super) fn estimate_frequency(pattern: &[u8], body: &[u8]) -> usize {
-    //vprintln!(
-    //    "\tkarp-rabin: searching haystack of size {} for needle of size {} with ident {}",
-    //    pattern.len(),
-    //    body.len(),
-    //    pattern[0] + pattern[1]
-    //);
+pub fn estimate_frequency(pattern: &[u8], body: &[u8]) -> usize {
     assert!(body.len() >= pattern.len());
     // A prime number for modulo operations to reduce collisions (q)
     const PRIME: isize = 2654435761;
diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs
index cabdca906..bb607b4f0 100644
--- a/src/dictionary/mod.rs
+++ b/src/dictionary/mod.rs
@@ -19,13 +19,15 @@
 //    reservoir of n/t, where `t` is the desired number of occurances
 //    we want the most common k-mers to have
 //  - Have the ability to estimate
-//    the frequency of a given k-mer: f(w: k-mer) calculates
+//    the frequency of a given k-mer: `f(w: k-mer)` calculates
 //    the frequency of w in the reservoir using a rolling karp-rabin hash
 //  - The score of a segment is the sum of `f(w)` called on every kmer within the segment
 mod cover;
 mod frequency;
 mod reservoir;
 
+use crate::dictionary::reservoir::create_sample;
+use alloc::vec;
 use core::cmp::Reverse;
 use cover::*;
 use std::{
@@ -38,13 +40,10 @@ use std::{
     vec::Vec,
 };
 
-use alloc::vec;
-
-use crate::dictionary::reservoir::create_sample;
-
 /// A set of values that are used during dictionary construction.
 ///
 /// Changing these values can improve the resulting dictionary size for certain datasets.
+// TODO: move `k` here.
 pub struct DictParams {
     /// Segment size.
     ///
@@ -58,7 +57,22 @@ pub struct DictParams {
     pub segment_size: u32,
 }
 
-/// Create a dictionary
+/// Creates a dictionary, training off of every file in this directory and all
+/// sub-directories.
+///
+/// The resulting dictionary will be approxamitely `dict_size` or less, and written to `output`.
+///
+/// # Errors
+/// This function returns `Ok(())` if the dictionary was created successfully, and an
+/// `Err(io::Error)` if an error was encountered reading the input directory.
+///
+/// # Examples
+/// ```no_run
+/// // Create a roughly 1mb dictionary, training off of the files in `sample_files`
+/// let input_folder = "sample_files/";
+/// let output = File::create("output.dict");
+/// ruzstd::dict::create_dict_from_dir(input_folder, &mut output, 1_000_000);
+/// ```
 pub fn create_dict_from_dir<P: AsRef<Path>, W: io::Write>(
     path: P,
     output: &mut W,
@@ -103,8 +117,8 @@ pub fn create_dict_from_dir<P: AsRef<Path>, W: io::Write>(
 ///
 /// - `source` will be used as training data for the entire dictionary.
 /// - `source_size` influences how the data is divided and sampled and is measured
-///    in bytes. While this does not need to be exact, estimates should attempt to be
-///    larger than the actual collection size.
+///   in bytes. While this does not need to be exact, estimates should attempt to be
+///   larger than the actual collection size.
 /// - `output` is where the completed dictionary will be written.
 /// - `dict_size` determines how large the complete dictionary should be. The completed
 ///   dictionary will be this size or smaller.
@@ -117,7 +131,7 @@ pub fn create_dict_from_source<R: io::Read, W: io::Write>(
     dict_size: usize,
 ) {
     vprintln!("create_dict: creating {dict_size} byte dict from {source_size} byte source");
-    let mut buffered_source = BufReader::with_capacity(5_000_000, source);
+    let mut buffered_source = BufReader::with_capacity(128_000, source);
 
     let params = DictParams { segment_size: 2048 };
     let num_segments = source_size / params.segment_size as usize;
@@ -167,6 +181,8 @@ pub fn create_dict_from_source<R: io::Read, W: io::Write>(
     // Write the dictionary with the highest scoring segment last because
     // closer items can be represented with a smaller offset
     while let Some(segment) = pool.pop() {
-        output.write(&segment.0.raw).expect("can write to output");
+        output
+            .write_all(&segment.0.raw)
+            .expect("can write to output");
     }
 }

From e17156d100cedde646ca0a7bf128d2a2a20c1576 Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Tue, 19 Aug 2025 11:00:12 -0600
Subject: [PATCH 11/16] refactor: specify raw content dictionary creation

---
 src/dictionary/mod.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs
index bb607b4f0..28b72932d 100644
--- a/src/dictionary/mod.rs
+++ b/src/dictionary/mod.rs
@@ -57,7 +57,7 @@ pub struct DictParams {
     pub segment_size: u32,
 }
 
-/// Creates a dictionary, training off of every file in this directory and all
+/// Creates a "raw content" dictionary, training off of every file in this directory and all
 /// sub-directories.
 ///
 /// The resulting dictionary will be approxamitely `dict_size` or less, and written to `output`.
@@ -73,7 +73,7 @@ pub struct DictParams {
 /// let output = File::create("output.dict");
 /// ruzstd::dict::create_dict_from_dir(input_folder, &mut output, 1_000_000);
 /// ```
-pub fn create_dict_from_dir<P: AsRef<Path>, W: io::Write>(
+pub fn create_raw_dict_from_dir<P: AsRef<Path>, W: io::Write>(
     path: P,
     output: &mut W,
     dict_size: usize,
@@ -108,12 +108,12 @@ pub fn create_dict_from_dir<P: AsRef<Path>, W: io::Write>(
         .fold(empty_reader, |acc, reader| Box::new(acc.chain(reader)));
 
     // Create a dict using the new reader
-    create_dict_from_source(chained_files, total_file_len as usize, output, dict_size);
+    create_raw_dict_from_source(chained_files, total_file_len as usize, output, dict_size);
     Ok(())
 }
 
-/// Read from `source` to create a dictionary of `dict_size`. The completed dictionary is written
-/// to `output`.
+/// Read from `source` to create a "raw content" dictionary of `dict_size`.
+/// The completed dictionary is written to `output`.
 ///
 /// - `source` will be used as training data for the entire dictionary.
 /// - `source_size` influences how the data is divided and sampled and is measured
@@ -124,7 +124,7 @@ pub fn create_dict_from_dir<P: AsRef<Path>, W: io::Write>(
 ///   dictionary will be this size or smaller.
 ///
 /// This function uses `BufRead` internally, the provided reader need not be buffered.
-pub fn create_dict_from_source<R: io::Read, W: io::Write>(
+pub fn create_raw_dict_from_source<R: io::Read, W: io::Write>(
     source: R,
     source_size: usize,
     output: &mut W,

From 5213ef79b4b7165e7454703368aabc4ef1969bc6 Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Tue, 19 Aug 2025 11:11:02 -0600
Subject: [PATCH 12/16] lint: fixing clippy

---
 src/bin/zstd.rs                | 86 +++++++++++++++++-----------------
 src/bin/zstd_dict.rs           |  6 +--
 src/bit_io/bit_reader.rs       |  6 +--
 src/encoding/levels/default.rs | 27 -----------
 src/encoding/levels/mod.rs     |  2 -
 5 files changed, 47 insertions(+), 80 deletions(-)
 delete mode 100644 src/encoding/levels/default.rs

diff --git a/src/bin/zstd.rs b/src/bin/zstd.rs
index b55318036..cc9762afb 100644
--- a/src/bin/zstd.rs
+++ b/src/bin/zstd.rs
@@ -1,13 +1,16 @@
 extern crate ruzstd;
 use std::fs::File;
+use std::io::BufReader;
 use std::io::Read;
 use std::io::Seek;
 use std::io::SeekFrom;
 use std::io::Write;
+use std::time::Instant;
 
 use ruzstd::decoding::errors::FrameDecoderError;
 use ruzstd::decoding::errors::ReadFrameHeaderError;
-use ruzstd::dictionary::create_dict_from_source;
+use ruzstd::encoding::CompressionLevel;
+use ruzstd::encoding::FrameCompressor;
 
 struct StateTracker {
     bytes_used: u64,
@@ -150,49 +153,44 @@ impl<R: Read> Read for PercentPrintReader<R> {
 }
 
 fn main() {
-    let input = File::open("ik9").expect("open input file");
-    //let input = File::open("local_corpus_files/enwik9").expect("open input file");
-    let input_len = input.metadata().unwrap().len() as usize;
-    let mut output = File::create("output.dict").expect("create output file");
-    create_dict_from_source(input, input_len, &mut output, 5_000_000);
-    //let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect();
-    //let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect();
-    //file_paths.remove(0);
-    //
-    //if flags.is_empty() {
-    //    let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
-    //    encoder.set_drain(Vec::new());
-    //
-    //    for path in file_paths {
-    //        let start_instant = Instant::now();
-    //        let file = std::fs::File::open(&path).unwrap();
-    //        let input_len = file.metadata().unwrap().len() as usize;
-    //        let file = PercentPrintReader {
-    //            reader: BufReader::new(file),
-    //            total: input_len,
-    //            counter: 0,
-    //            last_percent: 0,
-    //        };
-    //        encoder.set_source(file);
-    //        encoder.compress();
-    //        let mut output: Vec<_> = encoder.take_drain().unwrap();
-    //        println!(
-    //            "Compressed {path:} from {} to {} ({}%) took {}ms",
-    //            input_len,
-    //            output.len(),
-    //            if input_len == 0 {
-    //                0
-    //            } else {
-    //                output.len() * 100 / input_len
-    //            },
-    //            start_instant.elapsed().as_millis()
-    //        );
-    //        output.clear();
-    //        encoder.set_drain(output);
-    //    }
-    //} else {
-    //    decompress(&flags, &file_paths);
-    //}
+    let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect();
+    let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect();
+    file_paths.remove(0);
+    
+    if flags.is_empty() {
+       let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
+       encoder.set_drain(Vec::new());
+    
+       for path in file_paths {
+           let start_instant = Instant::now();
+           let file = std::fs::File::open(&path).unwrap();
+           let input_len = file.metadata().unwrap().len() as usize;
+           let file = PercentPrintReader {
+               reader: BufReader::new(file),
+               total: input_len,
+               counter: 0,
+               last_percent: 0,
+           };
+           encoder.set_source(file);
+           encoder.compress();
+           let mut output: Vec<_> = encoder.take_drain().unwrap();
+           println!(
+               "Compressed {path:} from {} to {} ({}%) took {}ms",
+               input_len,
+               output.len(),
+               if input_len == 0 {
+                   0
+               } else {
+                   output.len() * 100 / input_len
+               },
+               start_instant.elapsed().as_millis()
+           );
+           output.clear();
+           encoder.set_drain(output);
+       }
+    } else {
+       decompress(&flags, &file_paths);
+    }
 }
 
 fn do_something(data: &[u8], s: &mut StateTracker) {
diff --git a/src/bin/zstd_dict.rs b/src/bin/zstd_dict.rs
index f502700e7..25f24269b 100644
--- a/src/bin/zstd_dict.rs
+++ b/src/bin/zstd_dict.rs
@@ -1,4 +1,4 @@
-use ruzstd::dictionary::{create_dict_from_dir, create_dict_from_source};
+use ruzstd::dictionary::{create_raw_dict_from_dir, create_raw_dict_from_source};
 use std::fs::File;
 use std::path::Path;
 use std::env::args;
@@ -17,9 +17,9 @@ fn main() {
     if input_path.is_file() {
         let source = File::open(input_path).expect("unable to open input path");
         let source_size = source.metadata().unwrap().len();
-        create_dict_from_source(source, source_size as usize, &mut output, dict_size);
+        create_raw_dict_from_source(source, source_size as usize, &mut output, dict_size);
     } else {
-        create_dict_from_dir(input_path, &mut output, dict_size).unwrap();
+        create_raw_dict_from_dir(input_path, &mut output, dict_size).unwrap();
     }
 }
 
diff --git a/src/bit_io/bit_reader.rs b/src/bit_io/bit_reader.rs
index 4e88948a0..c8987250e 100644
--- a/src/bit_io/bit_reader.rs
+++ b/src/bit_io/bit_reader.rs
@@ -116,8 +116,7 @@ impl core::fmt::Display for GetBitsError {
             } => {
                 write!(
                     f,
-                    "Cant serve this request. The reader is limited to {} bits, requested {} bits",
-                    limit, num_requested_bits,
+                    "Cant serve this request. The reader is limited to {limit} bits, requested {num_requested_bits} bits"
                 )
             }
             GetBitsError::NotEnoughRemainingBits {
@@ -126,8 +125,7 @@ impl core::fmt::Display for GetBitsError {
             } => {
                 write!(
                     f,
-                    "Can\'t read {} bits, only have {} bits left",
-                    requested, remaining,
+                    "Can\'t read {requested} bits, only have {remaining} bits left"
                 )
             }
         }
diff --git a/src/encoding/levels/default.rs b/src/encoding/levels/default.rs
deleted file mode 100644
index 4b83bd246..000000000
--- a/src/encoding/levels/default.rs
+++ /dev/null
@@ -1,27 +0,0 @@
-use crate::{
-    common::MAX_BLOCK_SIZE,
-    encoding::{
-        block_header::BlockHeader, blocks::compress_block, frame_compressor::CompressState, Matcher,
-    },
-};
-use alloc::vec::Vec;
-
-/// Compresses a single block at [`crate::encoding::CompressionLevel::Default`].
-///
-/// # Parameters
-/// - `state`: [`CompressState`] so the compressor can refer to data prior to
-///   the start of this block
-/// - `last_block`: Whether or not this block is going to be the last block in the frame
-///   (needed because this info is written into the block header)
-/// - `uncompressed_data`: A block's worth of uncompressed data, taken from the
-///   larger input
-/// - `output`: As `uncompressed_data` is compressed, it's appended to `output`.
-#[inline]
-pub fn compress_default<M: Matcher>(
-    state: &mut CompressState<M>,
-    last_block: bool,
-    uncompressed_data: Vec<u8>,
-    output: &mut Vec<u8>,
-) {
-    let block_size = uncompressed_data.len() as u32;
-}
diff --git a/src/encoding/levels/mod.rs b/src/encoding/levels/mod.rs
index ce6f66bd8..fb39caaf8 100644
--- a/src/encoding/levels/mod.rs
+++ b/src/encoding/levels/mod.rs
@@ -1,4 +1,2 @@
 mod fastest;
 pub use fastest::compress_fastest;
-mod default;
-pub use default::compress_default;

From a710b220422bc65dcf4f1a7f92fed1adbef3767b Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Tue, 19 Aug 2025 11:51:53 -0600
Subject: [PATCH 13/16] docs: update readme.md to include dict builder

---
 .gitignore                  |  1 +
 Cargo.toml                  |  5 +--
 Readme.md                   | 37 +++++++++++++++++---
 src/bin/zstd.rs             | 68 ++++++++++++++++++-------------------
 src/bin/zstd_dict.rs        |  2 +-
 src/bin/zstd_stream.rs      |  4 +--
 src/dictionary/cover.rs     | 18 ++++------
 src/dictionary/mod.rs       |  6 ++--
 src/dictionary/reservoir.rs |  5 ++-
 src/lib.rs                  |  1 +
 src/tests/decode_corpus.rs  |  3 +-
 src/tests/dict_test.rs      |  5 +--
 src/tests/mod.rs            | 13 ++++---
 13 files changed, 97 insertions(+), 71 deletions(-)

diff --git a/.gitignore b/.gitignore
index 118714d18..6cb4d5724 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 **/*.rs.bk
 Cargo.lock
 /local_corpus_files
+/local_dict_corpus_files
 /orig-zstd
 fuzz_decodecorpus
 perf.data*
diff --git a/Cargo.toml b/Cargo.toml
index d66f4490f..996e2478a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,7 +15,7 @@ categories = ["compression"]
 [dependencies]
 twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"], optional = true }
 
-# Internal feature, only used when building as part of libstd, not part of the
+# Internal feature, only used when building as part of libstd, not part of the
 # stable interface of this crate.
 compiler_builtins = { version = "0.1.2", optional = true }
 core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
@@ -33,6 +33,7 @@ default = ["hash", "std"]
 hash = ["dep:twox-hash"]
 fuzz_exports = []
 std = []
+dict_builder = ["std"]
 
 # Internal feature, only used when building as part of libstd, not part of the
 # stable interface of this crate.
@@ -52,4 +53,4 @@ required-features = ["std"]
 
 [[bin]]
 name = "zstd_dict"
-required-features = ["std"]
+required-features = ["std", "dict_builder"]
diff --git a/Readme.md b/Readme.md
index 3281a3d02..c96e787b4 100644
--- a/Readme.md
+++ b/Readme.md
@@ -15,8 +15,20 @@ This crate is currently actively maintained.
 
 # Current Status
 
-Feature complete on the decoder side.
+## Decompression
+The `decoding` module provides a complete
+implementation of a Zstandard decompressor.
+
+In terms of speed, `ruzstd` is behind the original C implementation
+which has a Rust binding located [here](https://github.com/gyscos/zstd-rs).
 
+Measured with the `time` utility, with both the original zstd and this
+decoder decoding the same enwik9.zst file from a ramfs, this decoder is
+about 3.5 times slower. Enwik9 is highly compressible; for less compressible
+data (like an Ubuntu installation .iso) this decoder comes close to being
+only 1.4 times slower.
+
+## Compression
 On the compression side:
 - Support for generating compressed blocks at any compression level
   - [x] Uncompressed
@@ -24,13 +36,28 @@ On the compression side:
   - [ ] Default (roughly level 3)
   - [ ] Better (roughly level 7)
   - [ ] Best (roughly level 11)
-- [ ] Checksums
+- [x] Checksums
 - [ ] Dictionaries
 
-## Speed
-In terms of speed this library is behind the original C implementation which has a rust binding located [here](https://github.com/gyscos/zstd-rs).
+## Dictionary Generation
+When the `dict_builder` feature is enabled, the `dictionary` module
+provides the ability to create new dictionaries. 
+
+On the `github-users` sample set, our implementation benchmarks within
+0.2% of the official implementation (as of commit 
+`09e52d07340acdb2e13817b066e8be6e424f7258`):
+```
+uncompressed: 100.00% (7484607 bytes)
+no dict: 34.99% of original size (2618872 bytes)
+reference dict: 16.16% of no dict size (2195672 bytes smaller)
+our dict: 16.28% of no dict size (2192400 bytes smaller)
+```
+
+The dictionary generator only provides support for creating "raw
+content" dictionaries. Tagged dictionaries are currently unsupported.
 
-Measuring with the 'time' utility the original zstd and my decoder both decoding the same enwik9.zst file from a ramfs, my decoder is about 3.5 times slower. Enwik9 is highly compressible, for less compressible data (like a ubuntu installation .iso) my decoder comes close to only being 1.4 times slower.
+See <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format>
+for clarification.
 
 
 # How can you use it?
diff --git a/src/bin/zstd.rs b/src/bin/zstd.rs
index cc9762afb..b90dcb690 100644
--- a/src/bin/zstd.rs
+++ b/src/bin/zstd.rs
@@ -34,9 +34,7 @@ fn decompress(flags: &[String], file_paths: &[String]) {
     }
 
     if flags.len() != 2 {
-        eprintln!(
-            "No flags other than -d and -c are currently implemented. Flags used: {flags:?}"
-        );
+        eprintln!("No flags other than -d and -c are currently implemented. Flags used: {flags:?}");
         return;
     }
 
@@ -156,40 +154,40 @@ fn main() {
     let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect();
     let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect();
     file_paths.remove(0);
-    
+
     if flags.is_empty() {
-       let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
-       encoder.set_drain(Vec::new());
-    
-       for path in file_paths {
-           let start_instant = Instant::now();
-           let file = std::fs::File::open(&path).unwrap();
-           let input_len = file.metadata().unwrap().len() as usize;
-           let file = PercentPrintReader {
-               reader: BufReader::new(file),
-               total: input_len,
-               counter: 0,
-               last_percent: 0,
-           };
-           encoder.set_source(file);
-           encoder.compress();
-           let mut output: Vec<_> = encoder.take_drain().unwrap();
-           println!(
-               "Compressed {path:} from {} to {} ({}%) took {}ms",
-               input_len,
-               output.len(),
-               if input_len == 0 {
-                   0
-               } else {
-                   output.len() * 100 / input_len
-               },
-               start_instant.elapsed().as_millis()
-           );
-           output.clear();
-           encoder.set_drain(output);
-       }
+        let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
+        encoder.set_drain(Vec::new());
+
+        for path in file_paths {
+            let start_instant = Instant::now();
+            let file = std::fs::File::open(&path).unwrap();
+            let input_len = file.metadata().unwrap().len() as usize;
+            let file = PercentPrintReader {
+                reader: BufReader::new(file),
+                total: input_len,
+                counter: 0,
+                last_percent: 0,
+            };
+            encoder.set_source(file);
+            encoder.compress();
+            let mut output: Vec<_> = encoder.take_drain().unwrap();
+            println!(
+                "Compressed {path:} from {} to {} ({}%) took {}ms",
+                input_len,
+                output.len(),
+                if input_len == 0 {
+                    0
+                } else {
+                    output.len() * 100 / input_len
+                },
+                start_instant.elapsed().as_millis()
+            );
+            output.clear();
+            encoder.set_drain(output);
+        }
     } else {
-       decompress(&flags, &file_paths);
+        decompress(&flags, &file_paths);
     }
 }
 
diff --git a/src/bin/zstd_dict.rs b/src/bin/zstd_dict.rs
index 25f24269b..6ec26c92e 100644
--- a/src/bin/zstd_dict.rs
+++ b/src/bin/zstd_dict.rs
@@ -1,7 +1,7 @@
 use ruzstd::dictionary::{create_raw_dict_from_dir, create_raw_dict_from_source};
+use std::env::args;
 use std::fs::File;
 use std::path::Path;
-use std::env::args;
 
 fn main() {
     let args: Vec<String> = args().collect();
diff --git a/src/bin/zstd_stream.rs b/src/bin/zstd_stream.rs
index 521abf464..d22bac8c4 100644
--- a/src/bin/zstd_stream.rs
+++ b/src/bin/zstd_stream.rs
@@ -18,9 +18,7 @@ fn main() {
     }
 
     if flags.len() != 2 {
-        eprintln!(
-            "No flags other than -d and -c are currently implemented. Flags used: {flags:?}"
-        );
+        eprintln!("No flags other than -d and -c are currently implemented. Flags used: {flags:?}");
         return;
     }
 
diff --git a/src/dictionary/cover.rs b/src/dictionary/cover.rs
index 9f2c94922..093b8b656 100644
--- a/src/dictionary/cover.rs
+++ b/src/dictionary/cover.rs
@@ -2,10 +2,10 @@
 //! described in the paper "Effective Construction of Relative Lempel-Ziv Dictionaries",
 //! by Liao, Petri, Moffat, and Wirth, published under the University of Melbourne.
 //!
-//! See: https://people.eng.unimelb.edu.au/ammoffat/abstracts/lpmw16www.pdf
+//! See: <https://people.eng.unimelb.edu.au/ammoffat/abstracts/lpmw16www.pdf>
 //!
 //! Facebook's implementation was also used as a reference.
-//! https://github.com/facebook/zstd/tree/dev/lib/dictBuilder
+//! <https://github.com/facebook/zstd/tree/dev/lib/dictBuilder>
 
 use super::DictParams;
 use crate::dictionary::frequency::estimate_frequency;
@@ -43,11 +43,7 @@ impl PartialEq for Segment {
 
 impl PartialOrd for Segment {
     fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
-        match self.score.partial_cmp(&other.score) {
-            Some(core::cmp::Ordering::Equal) => {}
-            ord => return ord,
-        }
-        self.score.partial_cmp(&other.score)
+        Some(self.cmp(other))
     }
 }
 
@@ -68,15 +64,15 @@ pub struct Context {
 
 /// Returns the highest scoring segment in an epoch
 /// as a slice of that epoch.
-pub fn pick_best_segment<'epoch>(
+pub fn pick_best_segment(
     params: &DictParams,
     ctx: &mut Context,
-    collection_sample: &'epoch [u8],
+    collection_sample: &'_ [u8],
 ) -> Segment {
     let mut segments = collection_sample
         .chunks(params.segment_size as usize)
         .peekable();
-    let mut best_segment: &[u8] = &segments.peek().expect("at least one segment");
+    let mut best_segment: &[u8] = segments.peek().expect("at least one segment");
     let mut top_segment_score: usize = 0;
     // Iterate over segments and score each segment, keeping track of the best segment
     for segment in segments {
@@ -107,7 +103,7 @@ fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) ->
         if ctx.frequencies.contains_key(kmer) {
             continue;
         }
-        let kmer_score = estimate_frequency(kmer, &collection_sample);
+        let kmer_score = estimate_frequency(kmer, collection_sample);
         ctx.frequencies.insert(*kmer, kmer_score);
         segment_score += kmer_score;
     }
diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs
index 28b72932d..f55eff608 100644
--- a/src/dictionary/mod.rs
+++ b/src/dictionary/mod.rs
@@ -50,8 +50,8 @@ pub struct DictParams {
     /// As found under "4. Experiments - Varying Segment Size" in the original paper, a
     /// segment size of 2 kiB was effective.
     ///
-    /// "We explored a range of [segment_size] values and found the performance of LMC is insensitive
-    /// to [segment_size]. We fix [segment_size] to 2kiB
+    /// "We explored a range of \[`segment_size`\] values and found the performance of LMC is insensitive
+    /// to \[`segment_size`\]. We fix \[`segment_size`\] to 2kiB"
     ///
     /// Reasonable range: [16, 2048+]
     pub segment_size: u32,
@@ -85,7 +85,7 @@ pub fn create_raw_dict_from_dir<P: AsRef<Path>, W: io::Write>(
         for entry in dir {
             let entry = entry?;
             if entry.file_type()?.is_dir() {
-                recurse_read(fs::read_dir(&entry.path())?, file_paths)?;
+                recurse_read(fs::read_dir(entry.path())?, file_paths)?;
             } else {
                 file_paths.push(entry.path());
             }
diff --git a/src/dictionary/reservoir.rs b/src/dictionary/reservoir.rs
index 041ebc498..6fb318c91 100644
--- a/src/dictionary/reservoir.rs
+++ b/src/dictionary/reservoir.rs
@@ -2,7 +2,7 @@ use super::cover::K;
 use alloc::vec::Vec;
 use core::f64::consts::E;
 use fastrand;
-use std::io;
+use std::{io, vec};
 
 /// Creates a representative sample of `input` of `size` bytes.
 pub fn create_sample<R: io::Read>(input: &mut R, size: usize) -> Vec<u8> {
@@ -31,8 +31,7 @@ impl Reservoir {
     /// Initialize a new empty reservoir, creating an allocation of `size`.
     pub fn new(size: usize) -> Self {
         assert!(size >= 16, "Reservoirs cannot be below 16 bytes in size");
-        let mut lake = Vec::with_capacity(size);
-        lake.resize(size, 0);
+        let lake: Vec<u8> = vec![0; size];
         let k = K as u16;
         Self { lake, k }
     }
diff --git a/src/lib.rs b/src/lib.rs
index 49366d80d..62fd4b5a6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -35,6 +35,7 @@ macro_rules! vprintln {
 mod bit_io;
 mod common;
 pub mod decoding;
+#[cfg(feature = "dict_builder")]
 pub mod dictionary;
 pub mod encoding;
 
diff --git a/src/tests/decode_corpus.rs b/src/tests/decode_corpus.rs
index 69844e45e..369b59299 100644
--- a/src/tests/decode_corpus.rs
+++ b/src/tests/decode_corpus.rs
@@ -7,6 +7,7 @@ fn test_decode_corpus_files() {
     use alloc::string::{String, ToString};
     use alloc::vec::Vec;
     use std::fs;
+    use std::io::BufReader;
     use std::io::Read;
     use std::println;
 
@@ -82,7 +83,7 @@ fn test_decode_corpus_files() {
 
         let mut original_p = p.clone();
         original_p.truncate(original_p.len() - 4);
-        let original_f = fs::File::open(original_p).unwrap();
+        let original_f = BufReader::new(fs::File::open(original_p).unwrap());
         let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
 
         println!("Results for file: {}", p.clone());
diff --git a/src/tests/dict_test.rs b/src/tests/dict_test.rs
index 516b1782e..60e4a90d7 100644
--- a/src/tests/dict_test.rs
+++ b/src/tests/dict_test.rs
@@ -83,6 +83,7 @@ fn test_dict_decoding() {
     use alloc::string::{String, ToString};
     use alloc::vec::Vec;
     use std::fs;
+    use std::io::BufReader;
     use std::io::Read;
     use std::println;
 
@@ -97,7 +98,7 @@ fn test_dict_decoding() {
     let mut speeds_read = Vec::new();
 
     let mut files: Vec<_> = fs::read_dir("./dict_tests/files").unwrap().collect();
-    let dict = fs::File::open("./dict_tests/dictionary").unwrap();
+    let dict = BufReader::new(fs::File::open("./dict_tests/dictionary").unwrap());
     let dict: Vec<u8> = dict.bytes().map(|x| x.unwrap()).collect();
 
     files.sort_by_key(|x| match x {
@@ -155,7 +156,7 @@ fn test_dict_decoding() {
 
         let mut original_p = p.clone();
         original_p.truncate(original_p.len() - 4);
-        let original_f = fs::File::open(original_p).unwrap();
+        let original_f = BufReader::new(fs::File::open(original_p).unwrap());
         let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
 
         println!("Results for file: {}", p.clone());
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index 52fda6ddd..3a47122a5 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -130,8 +130,9 @@ fn test_frame_decoder() {
 fn test_decode_from_to() {
     use crate::decoding::FrameDecoder;
     use std::fs::File;
+    use std::io::BufReader;
     use std::io::Read;
-    let f = File::open("./decodecorpus_files/z000088.zst").unwrap();
+    let f = BufReader::new(File::open("./decodecorpus_files/z000088.zst").unwrap());
     let mut frame_dec = FrameDecoder::new();
 
     let content: Vec<u8> = f.bytes().map(|x| x.unwrap()).collect();
@@ -197,7 +198,7 @@ fn test_decode_from_to() {
         None => std::println!("No checksums to test\n"),
     }
 
-    let original_f = File::open("./decodecorpus_files/z000088").unwrap();
+    let original_f = BufReader::new(File::open("./decodecorpus_files/z000088").unwrap());
     let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
 
     if original.len() != result.len() {
@@ -233,6 +234,7 @@ fn test_specific_file() {
     use crate::decoding::BlockDecodingStrategy;
     use crate::decoding::FrameDecoder;
     use std::fs;
+    use std::io::BufReader;
     use std::io::Read;
 
     let path = "./decodecorpus_files/z000068.zst";
@@ -256,7 +258,7 @@ fn test_specific_file() {
         .unwrap();
     let result = frame_dec.collect().unwrap();
 
-    let original_f = fs::File::open("./decodecorpus_files/z000088").unwrap();
+    let original_f = BufReader::new(fs::File::open("./decodecorpus_files/z000088").unwrap());
     let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
 
     std::println!("Results for file: {}", path);
@@ -293,6 +295,7 @@ fn test_specific_file() {
 #[cfg(feature = "std")]
 fn test_streaming() {
     use std::fs;
+    use std::io::BufReader;
     use std::io::Read;
 
     let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap();
@@ -301,7 +304,7 @@ fn test_streaming() {
     let mut result = Vec::new();
     Read::read_to_end(&mut stream, &mut result).unwrap();
 
-    let original_f = fs::File::open("./decodecorpus_files/z000088").unwrap();
+    let original_f = BufReader::new(fs::File::open("./decodecorpus_files/z000088").unwrap());
     let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
 
     if original.len() != result.len() {
@@ -343,7 +346,7 @@ fn test_streaming() {
     let mut result = Vec::new();
     Read::read_to_end(&mut stream, &mut result).unwrap();
 
-    let original_f = fs::File::open("./decodecorpus_files/z000068").unwrap();
+    let original_f = BufReader::new(fs::File::open("./decodecorpus_files/z000068").unwrap());
     let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
 
     std::println!("Results for file:");

From ff3d5a7f6943488a003e3e64e61de59d93d60766 Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Tue, 19 Aug 2025 16:02:37 -0600
Subject: [PATCH 14/16] docs: include some rustdoc metadata

---
 Cargo.toml            | 4 ++++
 src/dictionary/mod.rs | 2 +-
 src/lib.rs            | 5 +++--
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 7fabc8906..64ec9f513 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,6 +12,10 @@ readme = "Readme.md"
 keywords = ["zstd", "zstandard", "decompression"]
 categories = ["compression"]
 
+[package.metadata.docs.rs]
+all-features = true
+rustdoc-args = ["--cfg", "docsrs"]
+
 [dependencies]
 twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"], optional = true }
 
diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs
index f55eff608..48fdbcb99 100644
--- a/src/dictionary/mod.rs
+++ b/src/dictionary/mod.rs
@@ -44,7 +44,7 @@ use std::{
 ///
 /// Changing these values can improve the resulting dictionary size for certain datasets.
 // TODO: move `k` here.
-pub struct DictParams {
+pub(super) struct DictParams {
     /// Segment size.
     ///
     /// As found under "4. Experiments - Varying Segment Size" in the original paper, a
diff --git a/src/lib.rs b/src/lib.rs
index 62fd4b5a6..7f3106eca 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,8 +7,8 @@
 //!
 //! ## Compression
 //! The [encoding] module contains the code for compression.
-//! Decompression can be achieved by using the [`encoding::compress`]/[`encoding::compress_to_vec`]
-//! functions or the [`encoding::FrameCompressor`]
+//! Compression can be achieved by using the [`encoding::compress`]/[`encoding::compress_to_vec`]
+//! functions or [`encoding::FrameCompressor`]
 //!
 #![doc = include_str!("../Readme.md")]
 #![no_std]
@@ -36,6 +36,7 @@ mod bit_io;
 mod common;
 pub mod decoding;
 #[cfg(feature = "dict_builder")]
+#[cfg_attr(docsrs, doc(cfg(feature = "dict_builder")))]
 pub mod dictionary;
 pub mod encoding;
 

From 38c7c8c89261944d34395977fd46c03ae118b5a6 Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Tue, 19 Aug 2025 16:15:43 -0600
Subject: [PATCH 15/16] lint: fixing clippy

---
 Readme.md                  |  2 +-
 src/bit_io/bit_reader.rs   |  2 +-
 src/bit_io/bit_writer.rs   | 10 +++++-----
 src/dictionary/mod.rs      |  5 +++--
 src/fse/fse_decoder.rs     |  2 +-
 src/huff0/huff0_decoder.rs |  2 +-
 6 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/Readme.md b/Readme.md
index c96e787b4..79d2ff943 100644
--- a/Readme.md
+++ b/Readme.md
@@ -46,7 +46,7 @@ provides the ability to create new dictionaries.
 On the `github-users` sample set, our implementation benchmarks within
 0.2% of the official implementation (as of commit 
 `09e52d07340acdb2e13817b066e8be6e424f7258`):
-```
+```text
 uncompressed: 100.00% (7484607 bytes)
 no dict: 34.99% of original size (2618872 bytes)
 reference dict: 16.16% of no dict size (2195672 bytes smaller)
diff --git a/src/bit_io/bit_reader.rs b/src/bit_io/bit_reader.rs
index c8987250e..2140ddb3b 100644
--- a/src/bit_io/bit_reader.rs
+++ b/src/bit_io/bit_reader.rs
@@ -66,7 +66,7 @@ impl<'s> BitReader<'s> {
 
             let mut bit_shift = bits_left_in_current_byte; //this many bits are already set in value
 
-            assert!(self.idx % 8 == 0);
+            assert!(self.idx.is_multiple_of(8));
 
             //collect full bytes
             for _ in 0..full_bytes_needed {
diff --git a/src/bit_io/bit_writer.rs b/src/bit_io/bit_writer.rs
index fb809926c..7ce228a54 100644
--- a/src/bit_io/bit_writer.rs
+++ b/src/bit_io/bit_writer.rs
@@ -45,7 +45,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
 
     /// Reset to an index. Currently only supports resetting to a byte aligned index
     pub fn reset_to(&mut self, index: usize) {
-        assert!(index % 8 == 0);
+        assert!(index.is_multiple_of(8));
         self.partial = 0;
         self.bits_in_partial = 0;
         self.bit_idx = index;
@@ -66,7 +66,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
 
         // We might be changing bits unaligned to byte borders.
         // This means the lower bits of the first byte we are touching must stay the same
-        if idx % 8 != 0 {
+        if !idx.is_multiple_of(8) {
             // How many (upper) bits will change in the first byte?
             let bits_in_first_byte = 8 - (idx % 8);
             // We don't support only changing a few bits in the middle of a byte
@@ -82,7 +82,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
             idx += bits_in_first_byte;
         }
 
-        assert!(idx % 8 == 0);
+        assert!(idx.is_multiple_of(8));
         // We are now byte aligned, change idx to byte resolution
         let mut idx = idx / 8;
 
@@ -113,7 +113,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
 
     /// Flush temporary internal buffers to the output buffer. Only works if this is currently byte aligned
     pub fn flush(&mut self) {
-        assert!(self.bits_in_partial % 8 == 0);
+        assert!(self.bits_in_partial.is_multiple_of(8));
         let full_bytes = self.bits_in_partial / 8;
         self.output
             .as_mut()
@@ -204,7 +204,7 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
     /// Returns how many bits are missing for an even byte
     pub fn misaligned(&self) -> usize {
         let idx = self.index();
-        if idx % 8 == 0 {
+        if idx.is_multiple_of(8) {
             0
         } else {
             8 - (idx % 8)
diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs
index 48fdbcb99..322f68c90 100644
--- a/src/dictionary/mod.rs
+++ b/src/dictionary/mod.rs
@@ -68,10 +68,11 @@ pub(super) struct DictParams {
 ///
 /// # Examples
 /// ```no_run
+/// use std::fs::File;
 /// // Create a roughly 1mb dictionary, training off of file in `sample_files`
 /// let input_folder = "sample_files/";
-/// let output = File::create("output.dict");
-/// ruzstd::dict::create_dict_from_dir(input_folder, &mut output, 1_000_000);
+/// let mut output = File::create("output.dict").unwrap();
+/// ruzstd::dictionary::create_raw_dict_from_dir(input_folder, &mut output, 1_000_000);
 /// ```
 pub fn create_raw_dict_from_dir<P: AsRef<Path>, W: io::Write>(
     path: P,
diff --git a/src/fse/fse_decoder.rs b/src/fse/fse_decoder.rs
index bf573c1b0..7cd59dc6d 100644
--- a/src/fse/fse_decoder.rs
+++ b/src/fse/fse_decoder.rs
@@ -297,7 +297,7 @@ impl FSETable {
             });
         }
 
-        let bytes_read = if br.bits_read() % 8 == 0 {
+        let bytes_read = if br.bits_read().is_multiple_of(8) {
             br.bits_read() / 8
         } else {
             (br.bits_read() / 8) + 1
diff --git a/src/huff0/huff0_decoder.rs b/src/huff0/huff0_decoder.rs
index 5c3e98bf0..1952aea3c 100644
--- a/src/huff0/huff0_decoder.rs
+++ b/src/huff0/huff0_decoder.rs
@@ -245,7 +245,7 @@ impl HuffmanTable {
                 let num_weights = header - 127;
                 self.weights.resize(num_weights as usize, 0);
 
-                let bytes_needed = if num_weights % 2 == 0 {
+                let bytes_needed = if num_weights.is_multiple_of(2) {
                     num_weights as usize / 2
                 } else {
                     (num_weights as usize / 2) + 1

From a598241c977cf3bd35edebe05763f2bbda5195dc Mon Sep 17 00:00:00 2001
From: arc <zleyyij@users.noreply.github.com>
Date: Thu, 21 Aug 2025 06:43:38 -0600
Subject: [PATCH 16/16] pr(cleanup): apply feedback from pull/91

- Fix typo in Cargo.toml
- Set VERBOSE to false and add a test to verify it's false
- Remove commented-out bench code from zstd_dict.rs
---
 Cargo.toml           |   2 +-
 src/bin/zstd_dict.rs | 116 -------------------------------------------
 src/lib.rs           |   6 +--
 src/tests/mod.rs     |   7 +++
 4 files changed, 11 insertions(+), 120 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 64ec9f513..5a7cec4a8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,7 +19,7 @@ rustdoc-args = ["--cfg", "docsrs"]
 [dependencies]
 twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"], optional = true }
 
-# Internal feature, only used when building as part of libstd, not part of theea
+# Internal feature, only used when building as part of libstd, not part of the
 # stable interface of this crate.
 compiler_builtins = { version = "0.1.2", optional = true }
 core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
diff --git a/src/bin/zstd_dict.rs b/src/bin/zstd_dict.rs
index 6ec26c92e..54a4d2651 100644
--- a/src/bin/zstd_dict.rs
+++ b/src/bin/zstd_dict.rs
@@ -22,119 +22,3 @@ fn main() {
         create_raw_dict_from_dir(input_path, &mut output, dict_size).unwrap();
     }
 }
-
-//struct BenchmarkResults {
-//    pub uncompressed_size: usize,
-//    pub nodict_size: usize,
-//    pub reference_size: usize,
-//    pub our_size: usize,
-//}
-//
-//impl Display for BenchmarkResults {
-//    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-//        writeln!(f, "uncompressed: 100.00% ({})", self.uncompressed_size)?;
-//        writeln!(
-//            f,
-//            "no dict: {:.2}% of original size ({})",
-//            f64::from(self.nodict_size as u32) / f64::from(self.uncompressed_size as u32) * 100.0,
-//            self.nodict_size
-//        )?;
-//        writeln!(
-//            f,
-//            "reference dict: {:.2}% of no dict size ({} bytes smaller)",
-//            f64::from(self.reference_size as u32) / f64::from(self.nodict_size as u32) * 100.0,
-//            self.nodict_size - self.reference_size
-//        )?;
-//        write!(
-//            f,
-//            "our dict: {:.2}% of no dict size ({} bytes smaller)",
-//            f64::from(self.our_size as u32) / f64::from(self.nodict_size as u32) * 100.0,
-//            self.nodict_size - self.our_size
-//        )?;
-//        Ok(())
-//    }
-//}
-//
-//struct Dumpster(pub usize);
-//
-//impl Write for Dumpster {
-//    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-//        self.0 += buf.len();
-//        Ok(buf.len())
-//    }
-//
-//    fn flush(&mut self) -> io::Result<()> {
-//        Ok(())
-//    }
-//}
-//
-///// Compares compression ratios achieved with a dictionary
-//#[allow(unused)]
-//fn bench<P: AsRef<Path>>(input_path: P) -> BenchmarkResults {
-//    // At what compression level the dicts are built with
-//    let compression_level = 1;
-//    // 1. Collect a list of a path to every file in the directory into `file_paths`
-//    println!("[bench]: collecting list of input files");
-//    let mut file_paths: Vec<PathBuf> = Vec::new();
-//    let dir: fs::ReadDir = fs::read_dir(&input_path).expect("read input path");
-//    fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec<PathBuf>) -> Result<(), io::Error> {
-//        for entry in dir {
-//            let entry = entry?;
-//            if entry.file_type()?.is_dir() {
-//                recurse_read(fs::read_dir(&entry.path())?, file_paths)?;
-//            } else {
-//                file_paths.push(entry.path());
-//            }
-//        }
-//        Ok(())
-//    }
-//    recurse_read(dir, &mut file_paths).expect("recursing over input dir");
-//
-//    // 2. Create two dictionaries, one with our strategy, and one with theirs
-//    println!("[bench]: creating reference dict");
-//    let reference_dict =
-//        zstd::dict::from_files(file_paths.iter(), 112640).expect("create reference dict");
-//    let mut our_dict = Vec::with_capacity(112640);
-//    println!("[bench]: creating our dict");
-//    create_dict_from_dir(input_path, &mut our_dict, 112640).expect("create our dict");
-//    // Open each file and compress it
-//    let mut uncompressed_size: usize = 0;
-//    let mut nodict_size: usize = 0;
-//
-//    let mut reference_output = Dumpster(0);
-//    let mut reference_encoder =
-//        zstd::Encoder::with_dictionary(&mut reference_output, compression_level, &reference_dict)
-//            .unwrap();
-//    reference_encoder.multithread(8).unwrap();
-//    let mut our_output = Dumpster(0);
-//    let mut our_encoder =
-//        zstd::Encoder::with_dictionary(&mut our_output, compression_level, &our_dict).unwrap();
-//    our_encoder.multithread(8).unwrap();
-//    for (idx, path) in file_paths.iter().enumerate() {
-//        if idx % 10 == 0 {
-//            println!("[bench]: compressing file {}/{}", idx + 1, file_paths.len());
-//        }
-//        let mut handle = File::open(path).unwrap();
-//        let mut data = Vec::new();
-//        handle.read_to_end(&mut data).unwrap();
-//        uncompressed_size += data.len();
-//        // Compress with no dict
-//        let nodict_output = zstd::encode_all(data.as_slice(), compression_level).unwrap();
-//        nodict_size += nodict_output.len();
-//        // Compress with the reference dict
-//        reference_encoder
-//            .write_all(data.as_slice())
-//            .expect("reference writer writing");
-//        // Compress with our dict
-//        our_encoder
-//            .write_all(data.as_slice())
-//            .expect("our writer writing");
-//    }
-//
-//    BenchmarkResults {
-//        uncompressed_size,
-//        nodict_size,
-//        reference_size: reference_output.0,
-//        our_size: our_output.0,
-//    }
-//}
diff --git a/src/lib.rs b/src/lib.rs
index 7f3106eca..0f85407a4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -21,7 +21,7 @@ extern crate std;
 extern crate alloc;
 
 #[cfg(feature = "std")]
-pub(crate) const VERBOSE: bool = true;
+pub(crate) const VERBOSE: bool = false;
 
 macro_rules! vprintln {
     ($($x:expr),*) => {
@@ -52,8 +52,6 @@ pub(crate) mod fse;
 #[cfg(not(feature = "fuzz_exports"))]
 pub(crate) mod huff0;
 
-mod tests;
-
 #[cfg(feature = "std")]
 pub mod io_std;
 
@@ -65,3 +63,5 @@ pub mod io_nostd;
 
 #[cfg(not(feature = "std"))]
 pub use io_nostd as io;
+
+mod tests;
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index 15580db3c..13090296e 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -579,3 +579,10 @@ pub mod dict_test;
 #[cfg(feature = "std")]
 pub mod encode_corpus;
 pub mod fuzz_regressions;
+
+#[cfg(feature = "std")]
+#[test]
+fn verbose_disabled() {
+    use crate::VERBOSE;
+    assert_eq!(VERBOSE, false);
+}