diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 326c038..8aa08aa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: - uses: actions/checkout@v4 - name: Install Rust - uses: dtolnay/rust-toolchain@stable + uses: actions-rust-lang/setup-rust-toolchain@v1 - name: Run tests run: cargo test --all-features @@ -26,7 +26,7 @@ jobs: - uses: actions/checkout@v4 - name: Install Rust - uses: dtolnay/rust-toolchain@stable + uses: actions-rust-lang/setup-rust-toolchain@v1 with: components: rustfmt, clippy diff --git a/.gitignore b/.gitignore index 633025f..1ae9484 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ testpkg/ .claude/settings.local.json target/ +.pysubclasses-cache diff --git a/Cargo.lock b/Cargo.lock index 33f1c8b..4d73c41 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + [[package]] name = "aho-corasick" version = "1.1.3" @@ -104,6 +110,26 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + [[package]] name = "bitflags" version = "2.10.0" @@ -173,6 +199,15 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam-deque" version = "0.8.6" @@ -216,6 +251,29 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "env_filter" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "jiff", + "log", +] + [[package]] name = "errno" version = "0.3.14" @@ -232,6 +290,16 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "flate2" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "float-cmp" version = "0.10.0" @@ -352,6 +420,30 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "jiff" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49" +dependencies = [ + "jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde", +] + +[[package]] +name = "jiff-static" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "libc" version = "0.2.177" @@ -376,6 +468,16 @@ version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -441,6 +543,21 @@ dependencies = [ "siphasher", ] +[[package]] +name = "portable-atomic" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" + +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -496,8 +613,12 @@ dependencies = [ "anyhow", "assert_cmd", "assert_fs", + "bincode", "clap", + "env_logger", + "flate2", "ignore", + "log", "predicates", "rayon", "ruff_python_ast", @@ -740,6 +861,12 @@ dependencies = [ "serde_core", ] +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + [[package]] name = "siphasher" version = "1.0.1" @@ -866,12 +993,24 @@ dependencies = [ "rand", ] +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "utf8parse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + [[package]] name = "wait-timeout" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index 0f25a9c..a8c2adf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,10 @@ clap = { version = "4", features = ["derive"] } anyhow = "1" thiserror = "1" +# Logging +log = "0.4" +env_logger = "0.11" + # File System ignore = "0.4" @@ -34,6 +38,10 @@ rayon = "1" serde = { version = "1", features = ["derive"] } serde_json = "1" +# Cache serialization and compression +bincode = "2" +flate2 = "1" + [dev-dependencies] assert_cmd = "2" assert_fs = "1" diff --git a/README.md b/README.md index 769070e..4078de3 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,8 @@ A Rust CLI tool and library for finding all subclasses (direct and transitive) o - **Gitignore support**: Automatically respects `.gitignore` files using the `ignore` crate - **Multiple output formats**: Text and JSON output formats - **Fast and efficient**: Written in Rust with parallel file traversal +- **Smart caching**: Caches parsed files with gzip compression for 2.5x speedup on repeated runs +- **Configurable logging**: Uses `env_logger` for flexible logging control ## Installation @@ -76,12 +78,38 @@ Get results in JSON format for scripting: pysubclasses Animal --format json ``` -### Verbose Mode +### Logging -Show additional information about the search process: +Control logging verbosity using the `RUST_LOG` environment variable: ```bash -pysubclasses Animal --verbose +# No logging (default) - only shows output +pysubclasses Animal + +# Show cache statistics +RUST_LOG=info pysubclasses Animal + +# Show detailed debug information +RUST_LOG=debug pysubclasses Animal + +# Show only pysubclasses logs (filter out dependencies) +RUST_LOG=pysubclasses=debug pysubclasses Animal +``` + +### Exclude Directories + +Exclude specific directories from analysis: + +```bash +pysubclasses Animal --exclude ./tests +``` + +### Disable Cache + +Force re-parsing of all files: + +```bash +pysubclasses Animal --no-cache ``` ## Examples @@ -249,6 +277,7 @@ src/ ├── parser.rs # AST parsing ├── registry.rs # Class registry ├── graph.rs # Inheritance graph +├── cache.rs # File-based caching with gzip └── error.rs # Error types tests/ diff --git a/Taskfile.yml b/Taskfile.yml index 870f49a..6adbe3d 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -1,8 +1,13 @@ version: "3" tasks: - fix.fmt: cargo fmt fix.lint: cargo clippy --all-targets --all-features --fix --allow-staged + fix.fmt: cargo fmt + + fix: + cmds: + - task: fix.lint + - task: fix.fmt check.test: cargo test --all-features check.lint: cargo clippy --all-targets --all-features -- -D warnings diff --git a/src/cache.rs b/src/cache.rs new file mode 100644 index 0000000..4b8056f --- /dev/null +++ b/src/cache.rs @@ -0,0 +1,178 @@ +//! Caching module for parsed Python files. +//! +//! This module implements a file-based cache to avoid re-parsing unchanged Python files. +//! The cache stores serialized `ParsedFile` results along with file metadata (mtime, size) +//! to detect changes. + +use crate::{ + error::Result, + parser::{self, ParsedFile}, +}; +use flate2::{Compression, read::GzDecoder, write::GzEncoder}; +use std::{ + collections::HashMap, + fs, + io::{Read, Write}, + path::{Path, PathBuf}, + time::SystemTime, +}; + +/// Metadata for a cached file. +#[derive(Debug, Clone, bincode::Encode, bincode::Decode)] +struct CacheEntry { + /// Last modified time of the source file + mtime: SystemTime, + /// Size of the source file in bytes + size: u64, + /// The parsed result + parsed: ParsedFile, +} + +/// The cache structure storing all cached parse results. +#[derive(Debug, bincode::Encode, bincode::Decode)] +struct Cache { + /// Version of the cache format (for future compatibility) + version: u32, + /// Map from file path to cache entry + entries: HashMap, +} + +impl Cache { + const VERSION: u32 = 1; + + fn new() -> Self { + Self { + version: Self::VERSION, + entries: HashMap::new(), + } + } + + fn load(cache_path: &Path) -> Option { + // Read compressed cache file + let file = fs::File::open(cache_path).ok()?; + let mut decoder = GzDecoder::new(file); + let mut decompressed = Vec::new(); + decoder.read_to_end(&mut decompressed).ok()?; + bincode::decode_from_slice(&decompressed, bincode::config::standard()) + .ok() + .map(|(cache, _)| cache) + } + + fn save(&self, cache_path: &Path) -> Result<()> { + if let Some(parent) = cache_path.parent() { + fs::create_dir_all(parent)?; + } + + // Serialize to bytes + let data = bincode::encode_to_vec(self, bincode::config::standard()) + .map_err(std::io::Error::other)?; + + // Compress and write + let file = fs::File::create(cache_path)?; + let mut encoder = GzEncoder::new(file, Compression::default()); + encoder.write_all(&data)?; + encoder.finish()?; + + Ok(()) + } +} + +/// Gets the cache file path for a given root directory. +fn get_cache_path(root_dir: &Path) -> PathBuf { + root_dir.join(".pysubclasses-cache") +} + +/// Gets file metadata for cache validation. +fn get_file_metadata(path: &Path) -> Option<(SystemTime, u64)> { + let metadata = fs::metadata(path).ok()?; + let mtime = metadata.modified().ok()?; + let size = metadata.len(); + Some((mtime, size)) +} + +/// Parses Python files with caching enabled. +/// +/// This function checks the cache for each file and only parses files that have changed +/// or are not in the cache. +/// +/// # Arguments +/// +/// * `root_dir` - The root directory (used for cache location and module path computation) +/// * `python_files` - List of Python files to parse +/// +/// # Returns +/// +/// A vector of parse results, same as `parse_files`. +pub fn parse_with_cache( + root_dir: &Path, + python_files: &[PathBuf], +) -> Result>> { + let cache_path = get_cache_path(root_dir); + + // Load existing cache + let mut cache = Cache::load(&cache_path).unwrap_or_else(Cache::new); + + // Check if cache version matches + if cache.version != Cache::VERSION { + cache = Cache::new(); + } + + let mut results = Vec::with_capacity(python_files.len()); + let mut files_to_parse = Vec::new(); + let mut cache_hits = 0; + let mut cache_misses = 0; + + // First pass: check cache + for file_path in python_files { + if let Some((mtime, size)) = get_file_metadata(file_path) + && let Some(entry) = cache.entries.get(file_path) + { + // Check if file has changed + if entry.mtime == mtime && entry.size == size { + // Cache hit + results.push(Ok(entry.parsed.clone())); + cache_hits += 1; + continue; + } + } + + // Cache miss - need to parse + files_to_parse.push(file_path.clone()); + cache_misses += 1; + } + + // Parse files that weren't in cache or have changed + if !files_to_parse.is_empty() { + let parse_results = parser::parse_files(root_dir, &files_to_parse)?; + + // Update cache and collect results + for parse_result in parse_results { + if let Ok(parsed) = &parse_result + && let Some((mtime, size)) = get_file_metadata(&parsed.file_path) + { + cache.entries.insert( + parsed.file_path.clone(), + CacheEntry { + mtime, + size, + parsed: parsed.clone(), + }, + ); + } + results.push(parse_result); + } + } + + // Save updated cache + if cache_misses > 0 + && let Err(e) = cache.save(&cache_path) + { + log::warn!("Failed to save cache: {e}"); + } + + if cache_hits > 0 || cache_misses > 0 { + log::info!("Cache: {cache_hits} hits, {cache_misses} misses"); + } + + Ok(results) +} diff --git a/src/discovery.rs b/src/discovery.rs index 5b771db..c5e802d 100644 --- a/src/discovery.rs +++ b/src/discovery.rs @@ -19,12 +19,54 @@ use crate::error::Result; /// /// A vector of paths to all `.py` files found in the directory tree. pub fn discover_python_files(root_dir: &Path) -> Result> { + discover_python_files_with_exclusions(root_dir, &[]) +} + +/// Discovers all Python files in a directory tree, excluding specified directories. +/// +/// Uses the `ignore` crate to respect `.gitignore` files and other VCS ignore patterns. +/// Additionally excludes directories specified by the caller. +/// +/// # Arguments +/// +/// * `root_dir` - The root directory to start searching from +/// * `exclude_dirs` - Directories to exclude from the search (can be relative or absolute) +/// +/// # Returns +/// +/// A vector of paths to all `.py` files found in the directory tree. +pub fn discover_python_files_with_exclusions( + root_dir: &Path, + exclude_dirs: &[PathBuf], +) -> Result> { let mut python_files = Vec::new(); + // Canonicalize exclusion paths relative to root_dir + let canonical_excludes: Vec = exclude_dirs + .iter() + .filter_map(|exclude_path| { + // Try as absolute first, then relative to root_dir + if exclude_path.is_absolute() { + exclude_path.canonicalize().ok() + } else { + root_dir.join(exclude_path).canonicalize().ok() + } + }) + .collect(); + for entry in Walk::new(root_dir) { let entry = entry.map_err(std::io::Error::other)?; let path = entry.path(); + // Check if this path is under any excluded directory + let is_excluded = canonical_excludes + .iter() + .any(|excluded| path.starts_with(excluded)); + + if is_excluded { + continue; + } + // Only include files (not directories) with .py extension if entry.file_type().is_some_and(|ft| ft.is_file()) && path diff --git a/src/lib.rs b/src/lib.rs index 8b5aa18..c8e7c5d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,6 +21,7 @@ //! # } //! ``` +pub mod cache; pub mod discovery; pub mod error; pub mod graph; @@ -92,11 +93,39 @@ impl SubclassFinder { /// - The directory cannot be read /// - Any Python files cannot be parsed pub fn new(root_dir: PathBuf) -> Result { + Self::with_options(root_dir, Vec::new(), true) + } + + /// Creates a new SubclassFinder with custom options. + /// + /// # Arguments + /// + /// * `root_dir` - The root directory to search for Python files + /// * `exclude_dirs` - Directories to exclude from the search + /// * `use_cache` - Whether to use the cache for faster repeated runs + /// + /// # Errors + /// + /// Returns an error if: + /// - The directory cannot be read + /// - Any Python files cannot be parsed + pub fn with_options( + root_dir: PathBuf, + exclude_dirs: Vec, + use_cache: bool, + ) -> Result { + let root_dir = root_dir.canonicalize()?; + // Discover all Python files - let python_files = discovery::discover_python_files(&root_dir)?; + let python_files = + discovery::discover_python_files_with_exclusions(&root_dir, &exclude_dirs)?; - // Parse files in parallel and collect results - let parse_results = parser::parse_files(&root_dir, &python_files)?; + // Parse files in parallel (with optional caching) + let parse_results = if use_cache { + cache::parse_with_cache(&root_dir, &python_files)? + } else { + parser::parse_files(&root_dir, &python_files)? + }; // Log any parse errors and collect successful parses let parsed_files: Vec<_> = parse_results @@ -104,7 +133,7 @@ impl SubclassFinder { .filter_map(|result| match result { Ok(parsed) => Some(parsed), Err(e) => { - eprintln!("Warning: {e}"); + log::warn!("{e}"); None } }) diff --git a/src/main.rs b/src/main.rs index 21390e0..d6f1f55 100644 --- a/src/main.rs +++ b/src/main.rs @@ -32,13 +32,20 @@ struct Args { #[arg(short, long, default_value = ".")] directory: PathBuf, + /// Exclude directories from analysis (can be specified multiple times) + /// + /// Paths can be relative to the search directory or absolute. + /// Example: --exclude ./tests + #[arg(short, long)] + exclude: Vec, + /// Output format #[arg(short, long, value_enum, default_value = "text")] format: OutputFormat, - /// Show additional information - #[arg(short, long)] - verbose: bool, + /// Disable cache (always parse all files) + #[arg(long)] + no_cache: bool, } #[derive(Debug, Clone, Copy, clap::ValueEnum)] @@ -64,6 +71,9 @@ struct JsonClass { } fn main() -> Result<()> { + // Initialize logger + env_logger::init(); + let args = Args::parse(); // Canonicalize the directory path @@ -72,24 +82,27 @@ fn main() -> Result<()> { .canonicalize() .with_context(|| format!("Failed to access directory: {}", args.directory.display()))?; - if args.verbose { - eprintln!("Searching for Python files in: {}", root_dir.display()); + log::debug!("Searching for Python files in: {}", root_dir.display()); + if !args.exclude.is_empty() { + log::debug!("Excluding directories: {:?}", args.exclude); + } + if args.no_cache { + log::debug!("Cache disabled"); } // Create the finder (this parses all Python files) - let finder = SubclassFinder::new(root_dir).context("Failed to analyze codebase")?; - - if args.verbose { - eprintln!("Found {} classes in codebase", finder.class_count()); - eprintln!( - "Searching for subclasses of '{}'{}\n", - args.class_name, - args.module - .as_ref() - .map(|m| format!(" in module '{m}'")) - .unwrap_or_default() - ); - } + let finder = SubclassFinder::with_options(root_dir, args.exclude, !args.no_cache) + .context("Failed to analyze codebase")?; + + log::debug!("Found {} classes in codebase", finder.class_count()); + log::debug!( + "Searching for subclasses of '{}'{}", + args.class_name, + args.module + .as_ref() + .map(|m| format!(" in module '{m}'")) + .unwrap_or_default() + ); // Find subclasses let subclasses = finder @@ -111,14 +124,14 @@ fn main() -> Result<()> { // Output results match args.format { - OutputFormat::Text => output_text(&args.class_name, &subclasses, args.verbose), + OutputFormat::Text => output_text(&args.class_name, &subclasses), OutputFormat::Json => output_json(&args.class_name, &args.module, &subclasses)?, } Ok(()) } -fn output_text(class_name: &str, subclasses: &[ClassReference], verbose: bool) { +fn output_text(class_name: &str, subclasses: &[ClassReference]) { if subclasses.is_empty() { println!("No subclasses found for '{class_name}'"); return; @@ -131,16 +144,7 @@ fn output_text(class_name: &str, subclasses: &[ClassReference], verbose: bool) { ); for class_ref in subclasses { - if verbose { - println!( - " {} ({})\n └─ {}", - class_ref.class_name, - class_ref.module_path, - class_ref.file_path.display() - ); - } else { - println!(" {} ({})", class_ref.class_name, class_ref.module_path); - } + println!(" {} ({})", class_ref.class_name, class_ref.module_path); } } diff --git a/src/parser.rs b/src/parser.rs index df3254e..9a97595 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -9,7 +9,7 @@ use std::path::{Path, PathBuf}; use crate::error::{Error, Result}; /// Represents a Python class definition. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, bincode::Encode, bincode::Decode)] pub struct ClassDefinition { /// The simple name of the class pub name: String, @@ -30,14 +30,14 @@ pub struct ClassDefinition { /// `from a import b` => { imported_item=a.b, imported_as=b } /// `from a import b as c` => { imported_item=a.b, imported_as=c } /// `from a.b import c` => { imported_item=a.b.c, imported_as=c } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, bincode::Encode, bincode::Decode)] pub struct Import { pub imported_item: String, pub imported_as: String, } /// The result of parsing a Python file. -#[derive(Debug)] +#[derive(Debug, Clone, bincode::Encode, bincode::Decode)] pub struct ParsedFile { /// The path of this file pub file_path: PathBuf, diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 793a13d..dae2aef 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -179,7 +179,7 @@ fn test_verbose_output() { cmd.arg("Animal") .arg("--directory") .arg(temp.path()) - .arg("--verbose") + .env("RUST_LOG", "debug") .assert() .success() .stderr(predicate::str::contains("Searching for Python files"))