diff --git a/cli/src/command/append.rs b/cli/src/command/append.rs
index ddc0fc9c1..9d416a892 100644
--- a/cli/src/command/append.rs
+++ b/cli/src/command/append.rs
@@ -446,6 +446,7 @@ fn append_to_archive(args: AppendCommand) -> anyhow::Result<()> {
             PathTransformers::new(args.substitutions, args.transforms),
             false,
         ),
+        sparse: false,
     };
 
     let archive = open_archive_then_seek_to_end(&archive_path)?;
diff --git a/cli/src/command/core.rs b/cli/src/command/core.rs
index f43567f02..43edb42e4 100644
--- a/cli/src/command/core.rs
+++ b/cli/src/command/core.rs
@@ -20,7 +20,7 @@ pub(crate) use self::safe_writer::SafeWriter;
 pub(crate) use self::timestamp::{TimeSource, TimestampStrategy};
 use crate::{
     cli::{CipherAlgorithmArgs, CompressionAlgorithmArgs, HashAlgorithmArgs, MissingTimePolicy},
-    utils::{self, PathPartExt, fs::HardlinkResolver},
+    utils::{self, PathPartExt, fs::HardlinkResolver, sparse::detect_sparse_map},
 };
 use anyhow::Context;
 pub(crate) use iter::ReorderByIndex;
@@ -355,6 +355,7 @@ pub(crate) struct CreateOptions {
     pub(crate) option: WriteOptions,
     pub(crate) keep_options: KeepOptions,
     pub(crate) pathname_editor: PathnameEditor,
+    pub(crate) sparse: bool,
 }
 
 #[derive(Clone, Debug)]
@@ -810,6 +811,49 @@ fn copy_buffered(file: fs::File, writer: &mut impl Write) -> io::Result<()> {
     Ok(())
 }
 
+/// Writes file data from a path, detecting and preserving sparse regions.
+///
+/// If the file is sparse, only data regions are written and the sparse map is set on the entry.
+/// If the file is not sparse, falls back to normal write behavior.
+///
+/// # Errors
+///
+/// Returns any I/O error from opening, probing, seeking or reading the file, or from
+/// writing to `entry`. A data region that can no longer be read in full (for example
+/// because the file shrank after detection) surfaces as `UnexpectedEof`.
+pub(crate) fn write_sparse_from_path(
+    entry: &mut EntryBuilder,
+    path: impl AsRef<Path>,
+) -> io::Result<()> {
+    use io::Seek;
+
+    let path = path.as_ref();
+    let mut file = fs::File::open(path)?;
+
+    if let Some(sparse_map) = detect_sparse_map(&file)? {
+        // Write only data regions using chunked I/O to avoid memory exhaustion
+        const CHUNK_SIZE: usize = 64 * 1024;
+        let mut buf = vec![0u8; CHUNK_SIZE];
+        for region in sparse_map.regions() {
+            file.seek(io::SeekFrom::Start(region.offset()))?;
+            let mut remaining = region.size();
+            while remaining > 0 {
+                // Clamp in u64 before narrowing: `remaining as usize` truncates on 32-bit
+                // targets and yields 0 for multiples of 4 GiB, which would spin forever.
+                let to_read = remaining.min(CHUNK_SIZE as u64) as usize;
+                file.read_exact(&mut buf[..to_read])?;
+                entry.write_all(&buf[..to_read])?;
+                remaining -= to_read as u64;
+            }
+        }
+        entry.set_sparse_map(sparse_map);
+        Ok(())
+    } else {
+        // Not sparse, use normal write
+        write_from_path(entry, path)
+    }
+}
+
 #[inline]
 pub(crate) fn write_from_path(writer: &mut impl Write, path: impl AsRef<Path>) -> io::Result<()> {
     let path = path.as_ref();
@@ -843,6 +887,7 @@ pub(crate) fn create_entry(
         option,
         keep_options,
         pathname_editor,
+        sparse,
     }: &CreateOptions,
 ) -> io::Result> {
     let CollectedEntry {
@@ -869,7 +914,11 @@ pub(crate) fn create_entry(
         }
         StoreAs::File => {
             let mut entry = EntryBuilder::new_file(entry_name, option)?;
-            write_from_path(&mut entry, path)?;
+            if *sparse {
+                write_sparse_from_path(&mut entry, path)?;
+            } else {
+                write_from_path(&mut entry, path)?;
+            }
             apply_metadata(entry, path, keep_options, metadata)?.build()
         }
         StoreAs::Dir => {
diff --git a/cli/src/command/core/mtree.rs b/cli/src/command/core/mtree.rs
index 2daf4d7eb..e8e8b8548 100644
--- a/cli/src/command/core/mtree.rs
+++ b/cli/src/command/core/mtree.rs
@@ -92,6 +92,7 @@ fn create_entry_from_mtree(
         option,
         keep_options,
         pathname_editor,
+        sparse: _,
     }: &CreateOptions,
 ) -> io::Result> {
     let entry_path = mtree_entry.path();
diff --git a/cli/src/command/create.rs b/cli/src/command/create.rs
index ac1ebc802..558343975 100644
--- a/cli/src/command/create.rs
+++ b/cli/src/command/create.rs
@@ -46,6 +46,7 @@ use std::{
     group(ArgGroup::new("user-flag").args(["numeric_owner", "uname"])),
     group(ArgGroup::new("group-flag").args(["numeric_owner", "gname"])),
     group(ArgGroup::new("recursive-flag").args(["recursive",
"no_recursive"])), + group(ArgGroup::new("sparse-flag").args(["sparse", "no_sparse"])), group(ArgGroup::new("keep-dir-flag").args(["keep_dir", "no_keep_dir"])), group(ArgGroup::new("keep-xattr-flag").args(["keep_xattr", "no_keep_xattr"])), group(ArgGroup::new("keep-timestamp-flag").args(["keep_timestamp", "no_keep_timestamp"])), @@ -164,6 +165,18 @@ pub(crate) struct CreateCommand { help = "Compress multiple files together for better compression ratio" )] solid: bool, + #[arg( + long, + requires = "unstable", + help = "Detect and preserve sparse files (unstable)" + )] + sparse: bool, + #[arg( + long, + requires = "unstable", + help = "Do not detect sparse files. This is the inverse option of --sparse (unstable)" + )] + no_sparse: bool, #[arg(long, value_name = "NAME", help = "Set user name for archive entries")] uname: Option, #[arg(long, value_name = "NAME", help = "Set group name for archive entries")] @@ -533,6 +546,7 @@ fn create_archive(args: CreateCommand) -> anyhow::Result<()> { write_option, keep_options, solid: args.solid, + sparse: args.sparse, pathname_editor, }; if let Some(size) = max_file_size { @@ -568,6 +582,7 @@ pub(crate) struct CreationContext { pub(crate) write_option: WriteOptions, pub(crate) keep_options: KeepOptions, pub(crate) solid: bool, + pub(crate) sparse: bool, pub(crate) pathname_editor: PathnameEditor, } @@ -577,6 +592,7 @@ pub(crate) fn create_archive_file( write_option, keep_options, solid, + sparse, pathname_editor, }: CreationContext, target_items: Vec, @@ -598,6 +614,7 @@ where option, keep_options, pathname_editor, + sparse, }; let rx = spawn_entry_results( target_items, @@ -638,6 +655,7 @@ fn create_archive_with_split( write_option, keep_options, solid, + sparse, pathname_editor, }: CreationContext, target_items: Vec, @@ -656,6 +674,7 @@ fn create_archive_with_split( option, keep_options, pathname_editor, + sparse, }; let rx = spawn_entry_results( target_items, diff --git a/cli/src/command/extract.rs 
b/cli/src/command/extract.rs
index b53b5be4d..ad973132c 100644
--- a/cli/src/command/extract.rs
+++ b/cli/src/command/extract.rs
@@ -26,7 +26,10 @@ use crate::{
 };
 use anyhow::Context;
 use clap::{ArgGroup, Parser, ValueHint};
-use pna::{DataKind, EntryName, EntryReference, NormalEntry, Permission, ReadOptions, prelude::*};
+use pna::{
+    DataKind, EntryName, EntryReference, NormalEntry, Permission, ReadOptions, SparseMap,
+    prelude::*,
+};
 #[cfg(target_os = "macos")]
 use std::os::macos::fs::FileTimesExt;
 #[cfg(windows)]
@@ -899,7 +902,9 @@ where
 
     match entry_kind {
         DataKind::File => {
-            if *safe_writes {
+            let sparse_map = item.sparse_map();
+            if *safe_writes && sparse_map.is_none() {
+                // Safe writes (atomic rename) - only for non-sparse files
                 let mut safe_writer = SafeWriter::new(&path)?;
                 {
                     let mut writer =
@@ -911,6 +916,12 @@ where
                 // Set timestamps before persist; after rename we lose the file handle
                 restore_timestamps(safe_writer.as_file_mut(), item.metadata(), keep_options)?;
                 safe_writer.persist()?;
+            } else if let Some(sparse_map) = sparse_map {
+                // Sparse file restoration - write data regions at correct offsets
+                let mut file = utils::fs::file_create(&path, remove_existing)?;
+                let mut reader = item.reader(ReadOptions::with_password(password))?;
+                restore_sparse_file(&mut file, &mut reader, sparse_map)?;
+                restore_timestamps(&mut file, item.metadata(), keep_options)?;
             } else {
                 if remove_existing {
                     utils::io::ignore_not_found(utils::fs::remove_path(&path))?;
@@ -1311,6 +1322,93 @@ where
     Ok(())
 }
 
+/// Restores a sparse file by writing only data regions and seeking over holes.
+///
+/// This creates a sparse file on filesystems that support it by using seek
+/// to skip over hole regions instead of writing zeros.
+///
+/// `reader` must yield the data regions back to back, in the order of
+/// `sparse_map.regions()`; the file is finally sized to the map's logical size.
+///
+/// # Errors
+///
+/// Returns `InvalidData` if the map's total data size overflows or a region extends
+/// past the logical size, and propagates any seek, read, write, or resize failure.
+fn restore_sparse_file(
+    file: &mut fs::File,
+    reader: &mut impl Read,
+    sparse_map: &SparseMap,
+) -> io::Result<()> {
+    use io::Seek;
+
+    let expected_data_size = sparse_map.data_size().ok_or_else(|| {
+        io::Error::new(
+            io::ErrorKind::InvalidData,
+            "Sparse map data size overflow (corrupted archive)",
+        )
+    })?;
+    let logical_size = sparse_map.logical_size();
+
+    // Write each data region at its correct offset using chunked I/O
+    const CHUNK_SIZE: usize = 64 * 1024;
+    let mut buf = vec![0u8; CHUNK_SIZE];
+    let mut total_read = 0u64;
+    for region in sparse_map.regions() {
+        // The map comes from the archive, so do not trust it: a region reaching past the
+        // logical size would be written and then silently cut off by `set_len` below.
+        // NOTE(review): `SparseMap::from_bytes` may already enforce this - confirm.
+        let region_end = region.offset().checked_add(region.size());
+        if !matches!(region_end, Some(end) if end <= logical_size) {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!(
+                    "Sparse region at offset {} (size {}) exceeds logical size {} \
+                     (corrupted archive)",
+                    region.offset(),
+                    region.size(),
+                    logical_size
+                ),
+            ));
+        }
+
+        file.seek(io::SeekFrom::Start(region.offset()))?;
+        let mut remaining = region.size();
+        while remaining > 0 {
+            // Clamp in u64 before narrowing: `remaining as usize` truncates on 32-bit
+            // targets and yields 0 for multiples of 4 GiB, which would spin forever.
+            let to_read = remaining.min(CHUNK_SIZE as u64) as usize;
+            reader.read_exact(&mut buf[..to_read]).map_err(|e| {
+                io::Error::new(
+                    e.kind(),
+                    format!(
+                        "Failed to read sparse data at offset {}: {} \
+                         (expected {} bytes total, read {} so far)",
+                        region.offset(),
+                        e,
+                        expected_data_size,
+                        total_read
+                    ),
+                )
+            })?;
+            file.write_all(&buf[..to_read])?;
+            remaining -= to_read as u64;
+            total_read += to_read as u64;
+        }
+    }
+
+    // Verify total bytes read matches expected
+    debug_assert_eq!(
+        total_read, expected_data_size,
+        "Sparse data size mismatch: read {}, expected {}",
+        total_read, expected_data_size
+    );
+
+    // Set the file length to the logical size (handles trailing holes)
+    file.set_len(logical_size)?;
+
+    Ok(())
+}
+
 fn ensure_directory_components(path: &Path, unlink_first: bool) -> io::Result<()> {
     if path.as_os_str().is_empty() {
         return Ok(());
diff --git a/cli/src/command/stdio.rs b/cli/src/command/stdio.rs
index fcda25c17..43ac595c8 100644
--- a/cli/src/command/stdio.rs
+++ b/cli/src/command/stdio.rs
@@ -951,6 +951,7 @@ fn run_create_archive(args: StdioCommand) -> anyhow::Result<()> {
         write_option: cli_option,
         keep_options,
         solid: args.solid,
+        sparse: false,
         pathname_editor: PathnameEditor::new(
             args.strip_components,
             PathTransformers::new(args.substitutions, args.transforms),
@@ -1258,6 +1259,7 @@
fn run_append(args: StdioCommand) -> anyhow::Result<()> {
             PathTransformers::new(args.substitutions, args.transforms),
             args.absolute_paths,
         ),
+        sparse: false,
     };
 
     // NOTE: "-" will use stdin/out
@@ -1412,6 +1414,7 @@ fn run_update(args: StdioCommand) -> anyhow::Result<()> {
             PathTransformers::new(args.substitutions, args.transforms),
             args.absolute_paths,
         ),
+        sparse: false,
     };
 
     // NOTE: "-" is not supported for update mode
diff --git a/cli/src/command/update.rs b/cli/src/command/update.rs
index 04d54eb2a..72bf123c8 100644
--- a/cli/src/command/update.rs
+++ b/cli/src/command/update.rs
@@ -450,6 +450,7 @@ fn update_archive(args: UpdateCommand) -> anyhow::Result<()> {
             PathTransformers::new(args.substitutions, args.transforms),
             false,
         ),
+        sparse: false,
     };
 
     let archives = collect_split_archives(&args.file.archive)?;
diff --git a/cli/src/utils.rs b/cli/src/utils.rs
index 8ae715b65..74f8a112f 100644
--- a/cli/src/utils.rs
+++ b/cli/src/utils.rs
@@ -10,6 +10,7 @@ pub(crate) mod mmap;
 pub(crate) mod os;
 mod path;
 pub(crate) mod process;
+pub(crate) mod sparse;
 pub(crate) mod str;
 
 pub(crate) use {globs::*, path::*};
diff --git a/cli/src/utils/sparse.rs b/cli/src/utils/sparse.rs
new file mode 100644
index 000000000..abca8e881
--- /dev/null
+++ b/cli/src/utils/sparse.rs
@@ -0,0 +1,185 @@
+//! Sparse file detection utilities.
+
+use pna::{DataRegion, SparseMap};
+use std::fs::File;
+use std::io;
+
+/// Detects sparse regions in a file.
+///
+/// Returns `Some(SparseMap)` if the file is sparse, `None` otherwise.
+///
+/// Detection strategy:
+/// 1. An empty file is never reported as sparse
+/// 2. Try SEEK_HOLE/SEEK_DATA (Linux, macOS, FreeBSD)
+/// 3. If those are unsupported, hole positions cannot be determined, so return `None`
+///    and let the caller store the file as a normal file
+///
+/// # Errors
+///
+/// Returns any error from reading the file metadata, and any `lseek` failure other
+/// than `EOPNOTSUPP`/`EINVAL`, which are treated as "unsupported".
+#[cfg(unix)]
+pub(crate) fn detect_sparse_map(file: &File) -> io::Result<Option<SparseMap>> {
+    use std::os::unix::io::AsRawFd;
+
+    let file_size = file.metadata()?.len();
+    if file_size == 0 {
+        return Ok(None);
+    }
+
+    match detect_with_seek_hole_data(file.as_raw_fd(), file_size) {
+        // SEEK_HOLE/SEEK_DATA unavailable: hole positions are unknown, so the file is
+        // stored as a normal file (st_blocks could only hint at sparseness, not locate it).
+        Err(e) if is_seek_hole_unsupported(&e) => Ok(None),
+        result => result,
+    }
+}
+
+/// Builds the sparse map of `fd` from its `SEEK_DATA`/`SEEK_HOLE` layout.
+///
+/// Returns `Ok(None)` when a single data region covers the whole file, otherwise the
+/// sparse map (with no regions when the entire file is a hole).
+///
+/// The descriptor's offset is reset to the start of the file on every exit path,
+/// including errors; a scan error takes precedence over a failure to reset.
+#[cfg(unix)]
+fn detect_with_seek_hole_data(
+    fd: std::os::unix::io::RawFd,
+    file_size: u64,
+) -> io::Result<Option<SparseMap>> {
+    let scanned = scan_data_regions(fd, file_size);
+
+    // Restore file position, capturing errno right after the call
+    // SAFETY: lseek is safe to call with a valid fd
+    let rewound = if unsafe { libc::lseek(fd, 0, libc::SEEK_SET) } < 0 {
+        Err(io::Error::last_os_error())
+    } else {
+        Ok(())
+    };
+    let regions = scanned?;
+    rewound?;
+
+    // Determine if file is actually sparse
+    let covers_whole_file =
+        matches!(regions.as_slice(), [only] if only.offset() == 0 && only.size() == file_size);
+    if covers_whole_file {
+        // File is not sparse (single region covering entire file)
+        Ok(None)
+    } else {
+        // Holes exist; an empty region list means the entire file is a hole
+        Ok(Some(SparseMap::new(file_size, regions)))
+    }
+}
+
+/// Collects the data regions of `fd` that lie below `file_size`.
+///
+/// Moves the descriptor's offset; the caller is responsible for resetting it.
+#[cfg(unix)]
+fn scan_data_regions(
+    fd: std::os::unix::io::RawFd,
+    file_size: u64,
+) -> io::Result<Vec<DataRegion>> {
+    let mut regions = Vec::new();
+    let mut pos: i64 = 0;
+
+    loop {
+        // Find next data region
+        // SAFETY: lseek is safe to call with a valid fd
+        let data_start = unsafe { libc::lseek(fd, pos, libc::SEEK_DATA) };
+        if data_start < 0 {
+            let err = io::Error::last_os_error();
+            if err.raw_os_error() == Some(libc::ENXIO) {
+                // No more data - rest is hole
+                break;
+            }
+            return Err(err);
+        }
+        // The file may have grown since `file_size` was sampled; data beyond the
+        // recorded size must not end up in the map.
+        if data_start as u64 >= file_size {
+            break;
+        }
+
+        // Find end of data region (next hole)
+        // SAFETY: lseek is safe to call with a valid fd
+        let hole_start = unsafe { libc::lseek(fd, data_start, libc::SEEK_HOLE) };
+        if hole_start < 0 {
+            return Err(io::Error::last_os_error());
+        }
+
+        // Defensive check: hole_start should always be >= data_start
+        if hole_start < data_start {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!(
+                    "Invalid SEEK_HOLE result: hole_start ({}) < data_start ({})",
+                    hole_start, data_start
+                ),
+            ));
+        }
+
+        // Clamp the region to the recorded size for the same reason as above
+        let data_end = (hole_start as u64).min(file_size);
+        let data_size = data_end - data_start as u64;
+        if data_size > 0 {
+            regions.push(DataRegion::new(data_start as u64, data_size));
+        }
+
+        pos = hole_start;
+        if pos as u64 >= file_size {
+            break;
+        }
+    }
+
+    Ok(regions)
+}
+
+#[cfg(unix)]
+fn is_seek_hole_unsupported(err: &io::Error) -> bool {
+    matches!(
+        err.raw_os_error(),
+        Some(libc::EOPNOTSUPP) | Some(libc::EINVAL)
+    )
+}
+
+#[cfg(not(unix))]
+pub(crate) fn detect_sparse_map(_file: &File) -> io::Result<Option<SparseMap>> {
+    // Windows: sparse detection not implemented yet
+    // Future: could use FSCTL_QUERY_ALLOCATED_RANGES
+    Ok(None)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::io::Write;
+
+    #[test]
+    fn detect_non_sparse_file() {
+        // Create a temp file using std
+        let dir = std::env::temp_dir();
+        let path = dir.join("pna_test_sparse_nonsparse");
+        let mut file = File::create(&path).unwrap();
+        file.write_all(b"hello world").unwrap();
+        file.flush().unwrap();
+
+        let file = File::open(&path).unwrap();
+        let result = detect_sparse_map(&file).unwrap();
+        assert!(result.is_none());
+
+        std::fs::remove_file(&path).ok();
+    }
+
+    #[test]
+    fn detect_empty_file() {
+        let dir = std::env::temp_dir();
+        let path = dir.join("pna_test_sparse_empty");
+        File::create(&path).unwrap();
+
+        let file = File::open(&path).unwrap();
+        let result = detect_sparse_map(&file).unwrap();
+        assert!(result.is_none());
+
+        std::fs::remove_file(&path).ok();
+    }
+}
diff --git a/cli/tests/cli/create.rs b/cli/tests/cli/create.rs index 87349001a..45def2fe0 100644 --- a/cli/tests/cli/create.rs +++ b/cli/tests/cli/create.rs @@ -28,6 +28,8 @@ mod option_strip_components; mod password_from_file; mod password_hash; mod sanitize_parent_components; +#[cfg(unix)] +mod sparse; mod substitution; mod symlink; mod transform; diff --git a/cli/tests/cli/create/sparse.rs b/cli/tests/cli/create/sparse.rs new file mode 100644 index 000000000..0cbb0278c --- /dev/null +++ b/cli/tests/cli/create/sparse.rs @@ -0,0 +1,530 @@ +//! Sparse file support tests. + +use crate::utils::setup; +use clap::Parser; +use portable_network_archive::cli; +use std::{ + fs::{self, File}, + io::{Read, Seek, SeekFrom, Write}, + os::unix::fs::MetadataExt, + path::PathBuf, +}; + +/// Creates a sparse file and returns whether the filesystem supports sparse files. +fn create_sparse_file(path: &PathBuf) -> bool { + // Create a sparse file: [data][hole][data] + // 4KB data + (1MB - 4KB) hole + 4KB data = 1MB + 4KB logical size + { + let file = File::create(path).unwrap(); + // First extend the file to create a hole + file.set_len(1024 * 1024 + 4096).unwrap(); + } + { + let mut file = fs::OpenOptions::new().write(true).open(path).unwrap(); + // Write 4KB of data at the start + file.write_all(&[0xAA; 4096]).unwrap(); + // Seek to 1MB and write another 4KB + file.seek(SeekFrom::Start(1024 * 1024)).unwrap(); + file.write_all(&[0xBB; 4096]).unwrap(); + } + + // Check if the filesystem actually created a sparse file + let meta = fs::metadata(path).unwrap(); + let logical_size = meta.len(); + let block_bytes = meta.blocks() * 512; + block_bytes < logical_size +} + +/// Precondition: Sparse file with a hole in the middle. +/// Action: Create archive with `--sparse`, then extract it without any sparse flag (entries with a sparse map are restored as sparse files automatically). +/// Expectation: Content matches; extracted file is sparse (st_blocks indicates holes).
+#[test] +fn sparse_file_roundtrip() { + setup(); + let base = PathBuf::from("sparse_roundtrip"); + if base.exists() { + fs::remove_dir_all(&base).unwrap(); + } + fs::create_dir_all(&base).unwrap(); + + let sparse_path = base.join("sparse.bin"); + if !create_sparse_file(&sparse_path) { + eprintln!("Skipping test: filesystem does not support sparse files"); + return; + } + + // Create archive with --sparse + cli::Cli::try_parse_from([ + "pna", + "--quiet", + "c", + "sparse_roundtrip/sparse.pna", + "--overwrite", + "--sparse", + "--unstable", + "sparse_roundtrip/sparse.bin", + ]) + .unwrap() + .execute() + .unwrap(); + + // Extract (sparse files are restored automatically when archive contains SPAR chunks) + cli::Cli::try_parse_from([ + "pna", + "--quiet", + "x", + "sparse_roundtrip/sparse.pna", + "--overwrite", + "--unstable", + "--out-dir", + "sparse_roundtrip/dist", + "--strip-components", + "1", + ]) + .unwrap() + .execute() + .unwrap(); + + // Verify content matches + let original = fs::read(&sparse_path).unwrap(); + let extracted = fs::read(base.join("dist/sparse.bin")).unwrap(); + assert_eq!(original, extracted, "Content should match after roundtrip"); + + // Verify extracted file is sparse + let extracted_meta = fs::metadata(base.join("dist/sparse.bin")).unwrap(); + let extracted_blocks = extracted_meta.blocks() * 512; + let extracted_size = extracted_meta.len(); + assert!( + extracted_blocks < extracted_size, + "Extracted file should be sparse: blocks={extracted_blocks}, size={extracted_size}" + ); +} + +/// Creates an all-hole file (no data, entire file is a hole). 
+fn create_all_hole_file(path: &PathBuf, size: u64) -> bool { + { + let file = File::create(path).unwrap(); + file.set_len(size).unwrap(); + } + + // Check if the filesystem actually created a sparse file + // Use the same check as create_sparse_file for consistency + let meta = fs::metadata(path).unwrap(); + let block_bytes = meta.blocks() * 512; + block_bytes < size +} + +/// Precondition: All-hole file (entire file is a hole, no data). +/// Action: Create archive with `--sparse`, then extract it without any sparse flag (entries with a sparse map are restored as sparse files automatically). +/// Expectation: Extracted file has correct size and reads as zeros. +#[test] +fn sparse_all_hole_file_roundtrip() { + setup(); + let base = PathBuf::from("sparse_all_hole"); + if base.exists() { + fs::remove_dir_all(&base).unwrap(); + } + fs::create_dir_all(&base).unwrap(); + + let sparse_path = base.join("hole.bin"); + let file_size = 1024 * 1024; // 1MB all-hole file + if !create_all_hole_file(&sparse_path, file_size) { + eprintln!("Skipping test: filesystem does not support sparse files"); + return; + } + + // Create archive with --sparse + cli::Cli::try_parse_from([ + "pna", + "--quiet", + "c", + "sparse_all_hole/hole.pna", + "--overwrite", + "--sparse", + "--unstable", + "sparse_all_hole/hole.bin", + ]) + .unwrap() + .execute() + .unwrap(); + + // Extract (sparse files are restored automatically when archive contains SPAR chunks) + cli::Cli::try_parse_from([ + "pna", + "--quiet", + "x", + "sparse_all_hole/hole.pna", + "--overwrite", + "--unstable", + "--out-dir", + "sparse_all_hole/dist", + "--strip-components", + "1", + ]) + .unwrap() + .execute() + .unwrap(); + + // Verify size matches + let extracted_meta = fs::metadata(base.join("dist/hole.bin")).unwrap(); + assert_eq!( + extracted_meta.len(), + file_size, + "Extracted file should have correct logical size" + ); + + // Verify content is all zeros + let mut extracted_file = File::open(base.join("dist/hole.bin")).unwrap(); + let mut buf = vec![0u8; 4096]; + let mut total_read = 0u64; + loop { + let
n = extracted_file.read(&mut buf).unwrap(); + if n == 0 { + break; + } + assert!( + buf[..n].iter().all(|&b| b == 0), + "All-hole file should read as zeros" + ); + total_read += n as u64; + } + assert_eq!(total_read, file_size, "Should read entire file"); + + // Verify extracted file is sparse (minimal disk usage) + // Note: Some filesystems may not preserve sparse-ness on extraction + let extracted_blocks = extracted_meta.blocks() * 512; + if extracted_blocks >= file_size { + eprintln!( + "Note: Extracted all-hole file is not sparse (blocks={extracted_blocks}, size={file_size}). \ + This may be expected on some filesystems." + ); + } +} + +/// Creates a sparse file with multiple data regions and holes. +/// Pattern: [data][hole][data][hole][data] +fn create_multi_region_sparse_file(path: &PathBuf) -> bool { + { + let file = File::create(path).unwrap(); + // 3 data regions of 4KB each, separated by 256KB holes + // Total: 4KB + 256KB + 4KB + 256KB + 4KB = ~524KB logical + file.set_len(4096 + 256 * 1024 + 4096 + 256 * 1024 + 4096) + .unwrap(); + } + { + let mut file = fs::OpenOptions::new().write(true).open(path).unwrap(); + // Region 1: 0-4KB with pattern 0xAA + file.write_all(&[0xAA; 4096]).unwrap(); + // Region 2: 260KB-264KB with pattern 0xBB + file.seek(SeekFrom::Start(4096 + 256 * 1024)).unwrap(); + file.write_all(&[0xBB; 4096]).unwrap(); + // Region 3: 520KB-524KB with pattern 0xCC + file.seek(SeekFrom::Start(4096 + 256 * 1024 + 4096 + 256 * 1024)) + .unwrap(); + file.write_all(&[0xCC; 4096]).unwrap(); + } + + // Check if sparse + let meta = fs::metadata(path).unwrap(); + let logical_size = meta.len(); + let block_bytes = meta.blocks() * 512; + block_bytes < logical_size +} + +/// Precondition: Sparse file with multiple data regions separated by holes. +/// Action: Create archive with `--sparse`, then extract it without any sparse flag (entries with a sparse map are restored as sparse files automatically). +/// Expectation: All data regions preserved with correct patterns; holes intact.
+#[test] +fn sparse_multi_region_roundtrip() { + setup(); + let base = PathBuf::from("sparse_multi_region"); + if base.exists() { + fs::remove_dir_all(&base).unwrap(); + } + fs::create_dir_all(&base).unwrap(); + + let sparse_path = base.join("multi.bin"); + if !create_multi_region_sparse_file(&sparse_path) { + eprintln!("Skipping test: filesystem does not support sparse files"); + return; + } + + // Create archive with --sparse + cli::Cli::try_parse_from([ + "pna", + "--quiet", + "c", + "sparse_multi_region/multi.pna", + "--overwrite", + "--sparse", + "--unstable", + "sparse_multi_region/multi.bin", + ]) + .unwrap() + .execute() + .unwrap(); + + // Extract (sparse files are restored automatically when archive contains SPAR chunks) + cli::Cli::try_parse_from([ + "pna", + "--quiet", + "x", + "sparse_multi_region/multi.pna", + "--overwrite", + "--unstable", + "--out-dir", + "sparse_multi_region/dist", + "--strip-components", + "1", + ]) + .unwrap() + .execute() + .unwrap(); + + // Verify content matches exactly + let original = fs::read(&sparse_path).unwrap(); + let extracted = fs::read(base.join("dist/multi.bin")).unwrap(); + assert_eq!( + original, extracted, + "Content should match after roundtrip for multi-region sparse file" + ); + + // Verify specific data patterns + let expected_size = 4096 + 256 * 1024 + 4096 + 256 * 1024 + 4096; + assert_eq!(extracted.len(), expected_size); + + // Check region 1: 0xAA pattern + assert!( + extracted[..4096].iter().all(|&b| b == 0xAA), + "Region 1 should have 0xAA pattern" + ); + + // Check hole 1: zeros + assert!( + extracted[4096..4096 + 256 * 1024].iter().all(|&b| b == 0), + "Hole 1 should be zeros" + ); + + // Check region 2: 0xBB pattern + let region2_start = 4096 + 256 * 1024; + assert!( + extracted[region2_start..region2_start + 4096] + .iter() + .all(|&b| b == 0xBB), + "Region 2 should have 0xBB pattern" + ); + + // Check hole 2: zeros + let hole2_start = region2_start + 4096; + assert!( + 
extracted[hole2_start..hole2_start + 256 * 1024] + .iter() + .all(|&b| b == 0), + "Hole 2 should be zeros" + ); + + // Check region 3: 0xCC pattern + let region3_start = hole2_start + 256 * 1024; + assert!( + extracted[region3_start..region3_start + 4096] + .iter() + .all(|&b| b == 0xCC), + "Region 3 should have 0xCC pattern" + ); + + // Verify extracted file is sparse + let extracted_meta = fs::metadata(base.join("dist/multi.bin")).unwrap(); + let extracted_blocks = extracted_meta.blocks() * 512; + let extracted_size = extracted_meta.len(); + assert!( + extracted_blocks < extracted_size, + "Extracted multi-region file should be sparse: blocks={extracted_blocks}, size={extracted_size}" + ); +} + +/// Creates a sparse file with trailing hole (data at start, hole at end). +fn create_trailing_hole_sparse_file(path: &PathBuf) -> bool { + { + let file = File::create(path).unwrap(); + // 4KB data + 1MB trailing hole + file.set_len(4096 + 1024 * 1024).unwrap(); + } + { + let mut file = fs::OpenOptions::new().write(true).open(path).unwrap(); + file.write_all(&[0xDD; 4096]).unwrap(); + } + + let meta = fs::metadata(path).unwrap(); + let logical_size = meta.len(); + let block_bytes = meta.blocks() * 512; + block_bytes < logical_size +} + +/// Precondition: Sparse file with data at start and trailing hole. +/// Action: Create archive with `--sparse`, then extract it without any sparse flag (entries with a sparse map are restored as sparse files automatically). +/// Expectation: File has correct logical size; trailing zeros preserved.
+#[test] +fn sparse_trailing_hole_roundtrip() { + setup(); + let base = PathBuf::from("sparse_trailing_hole"); + if base.exists() { + fs::remove_dir_all(&base).unwrap(); + } + fs::create_dir_all(&base).unwrap(); + + let sparse_path = base.join("trailing.bin"); + if !create_trailing_hole_sparse_file(&sparse_path) { + eprintln!("Skipping test: filesystem does not support sparse files"); + return; + } + + // Create archive with --sparse + cli::Cli::try_parse_from([ + "pna", + "--quiet", + "c", + "sparse_trailing_hole/trailing.pna", + "--overwrite", + "--sparse", + "--unstable", + "sparse_trailing_hole/trailing.bin", + ]) + .unwrap() + .execute() + .unwrap(); + + // Extract (sparse files are restored automatically when archive contains SPAR chunks) + cli::Cli::try_parse_from([ + "pna", + "--quiet", + "x", + "sparse_trailing_hole/trailing.pna", + "--overwrite", + "--unstable", + "--out-dir", + "sparse_trailing_hole/dist", + "--strip-components", + "1", + ]) + .unwrap() + .execute() + .unwrap(); + + // Verify content matches + let original = fs::read(&sparse_path).unwrap(); + let extracted = fs::read(base.join("dist/trailing.bin")).unwrap(); + assert_eq!( + original, extracted, + "Content should match for trailing hole sparse file" + ); + + // Verify logical size is correct (includes trailing hole) + let expected_size = 4096 + 1024 * 1024; + assert_eq!(extracted.len(), expected_size); + + // Verify data region + assert!( + extracted[..4096].iter().all(|&b| b == 0xDD), + "Data region should have 0xDD pattern" + ); + + // Verify trailing hole is zeros + assert!( + extracted[4096..].iter().all(|&b| b == 0), + "Trailing hole should be zeros" + ); +} + +/// Creates a sparse file with leading hole (hole at start, data at end). 
+fn create_leading_hole_sparse_file(path: &PathBuf) -> bool { + { + let file = File::create(path).unwrap(); + // 1MB leading hole + 4KB data + file.set_len(1024 * 1024 + 4096).unwrap(); + } + { + let mut file = fs::OpenOptions::new().write(true).open(path).unwrap(); + file.seek(SeekFrom::Start(1024 * 1024)).unwrap(); + file.write_all(&[0xEE; 4096]).unwrap(); + } + + let meta = fs::metadata(path).unwrap(); + let logical_size = meta.len(); + let block_bytes = meta.blocks() * 512; + block_bytes < logical_size +} + +/// Precondition: Sparse file with leading hole and data at end. +/// Action: Create archive with `--sparse`, then extract it without any sparse flag (entries with a sparse map are restored as sparse files automatically). +/// Expectation: Leading zeros preserved; data at correct offset. +#[test] +fn sparse_leading_hole_roundtrip() { + setup(); + let base = PathBuf::from("sparse_leading_hole"); + if base.exists() { + fs::remove_dir_all(&base).unwrap(); + } + fs::create_dir_all(&base).unwrap(); + + let sparse_path = base.join("leading.bin"); + if !create_leading_hole_sparse_file(&sparse_path) { + eprintln!("Skipping test: filesystem does not support sparse files"); + return; + } + + // Create archive with --sparse + cli::Cli::try_parse_from([ + "pna", + "--quiet", + "c", + "sparse_leading_hole/leading.pna", + "--overwrite", + "--sparse", + "--unstable", + "sparse_leading_hole/leading.bin", + ]) + .unwrap() + .execute() + .unwrap(); + + // Extract (sparse files are restored automatically when archive contains SPAR chunks) + cli::Cli::try_parse_from([ + "pna", + "--quiet", + "x", + "sparse_leading_hole/leading.pna", + "--overwrite", + "--unstable", + "--out-dir", + "sparse_leading_hole/dist", + "--strip-components", + "1", + ]) + .unwrap() + .execute() + .unwrap(); + + // Verify content matches + let original = fs::read(&sparse_path).unwrap(); + let extracted = fs::read(base.join("dist/leading.bin")).unwrap(); + assert_eq!( + original, extracted, + "Content should match for leading hole sparse file" + ); + + // Verify logical size + let
expected_size = 1024 * 1024 + 4096; + assert_eq!(extracted.len(), expected_size); + + // Verify leading hole is zeros + assert!( + extracted[..1024 * 1024].iter().all(|&b| b == 0), + "Leading hole should be zeros" + ); + + // Verify data region + assert!( + extracted[1024 * 1024..].iter().all(|&b| b == 0xEE), + "Data region should have 0xEE pattern" + ); +} diff --git a/lib/src/chunk/types.rs b/lib/src/chunk/types.rs index b309f84d5..e7b30a79f 100644 --- a/lib/src/chunk/types.rs +++ b/lib/src/chunk/types.rs @@ -56,6 +56,7 @@ impl Error for ChunkTypeError {} /// [`FEND`](Self::FEND) (end) /// - **Solid mode**: [`SHED`](Self::SHED) (header), [`SDAT`](Self::SDAT) (data), /// [`SEND`](Self::SEND) (end) +/// - **Sparse files**: [`SPAR`](Self::SPAR) (sparse file map) /// - **Encryption**: [`PHSF`](Self::PHSF) (password hash string format) /// /// # Ancillary Chunks @@ -103,6 +104,11 @@ impl ChunkType { pub const SDAT: ChunkType = ChunkType(*b"SDAT"); /// Solid mode data stream end marker pub const SEND: ChunkType = ChunkType(*b"SEND"); + /// Sparse file map + /// + /// Contains the logical file size and a list of data regions for sparse files. + /// When present, FDAT contains only the data regions, not the full file content. 
+ pub const SPAR: ChunkType = ChunkType(*b"SPAR"); // -- Auxiliary chunks -- /// Raw file size @@ -324,4 +330,14 @@ mod tests { fn is_safe_to_copy() { assert!(!ChunkType::AHED.is_safe_to_copy()); } + + #[test] + fn spar_chunk_properties() { + // SPAR: Critical (S=uppercase), Public (P=uppercase), + // Reserved (A=uppercase), Unsafe-to-copy (R=uppercase) + assert!(ChunkType::SPAR.is_critical()); + assert!(!ChunkType::SPAR.is_private()); + assert!(!ChunkType::SPAR.is_set_reserved()); + assert!(!ChunkType::SPAR.is_safe_to_copy()); + } } diff --git a/lib/src/entry.rs b/lib/src/entry.rs index 0a9f0eaa7..fe1513732 100644 --- a/lib/src/entry.rs +++ b/lib/src/entry.rs @@ -6,6 +6,7 @@ mod name; mod options; mod read; mod reference; +mod sparse; mod write; pub use self::{ @@ -16,6 +17,7 @@ pub use self::{ name::*, options::*, reference::*, + sparse::{DataRegion, SparseMap}, }; pub(crate) use self::{private::*, read::*, write::*}; use crate::{ @@ -593,6 +595,7 @@ pub struct NormalEntry> { pub(crate) data: Vec, pub(crate) metadata: Metadata, pub(crate) xattrs: Vec, + pub(crate) sparse_map: Option, } impl TryFrom> for NormalEntry @@ -644,6 +647,7 @@ where let mut mtime_ns = None; let mut atime_ns = None; let mut permission = None; + let mut sparse_map = None; for chunk in chunks { match chunk.ty { ChunkType::FEND => break, @@ -657,6 +661,15 @@ where compressed_size += chunk.data().len(); data.push(chunk.data); } + ChunkType::SPAR => { + if sparse_map.is_some() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Duplicate SPAR chunk in entry", + )); + } + sparse_map = Some(SparseMap::from_bytes(chunk.data())?); + } ChunkType::fSIZ => size = Some(u128_from_be_bytes_last(chunk.data())), ChunkType::cTIM => ctime = Some(timestamp(chunk.data())?), ChunkType::mTIM => mtime = Some(timestamp(chunk.data())?), @@ -695,6 +708,7 @@ where }, data, xattrs, + sparse_map, }) } } @@ -721,6 +735,9 @@ where for ex in &self.extra { total += ex.write_chunk_in(writer)?; } + if let 
Some(ref sparse_map) = self.sparse_map { + total += (ChunkType::SPAR, sparse_map.to_bytes()).write_chunk_in(writer)?; + } if let Some(raw_file_size) = raw_file_size { total += ( ChunkType::fSIZ, @@ -784,6 +801,9 @@ where let mut vec = Vec::new(); vec.push(RawChunk::from_data(ChunkType::FHED, self.header.to_bytes())); vec.extend(self.extra.into_iter().map(Into::into)); + if let Some(sparse_map) = self.sparse_map { + vec.push(RawChunk::from_data(ChunkType::SPAR, sparse_map.to_bytes())); + } if let Some(raw_file_size) = raw_file_size { vec.push(RawChunk::from_data( ChunkType::fSIZ, @@ -906,6 +926,17 @@ impl NormalEntry { &self.xattrs } + /// Returns the sparse map if this entry represents a sparse file. + /// + /// When a sparse map is present: + /// - The entry's FDAT data contains only the data regions (holes are omitted) + /// - Use [`SparseMap::logical_size()`] for the original file size + /// - Use [`SparseMap::regions()`] to determine where each data region belongs + #[inline] + pub fn sparse_map(&self) -> Option<&SparseMap> { + self.sparse_map.as_ref() + } + /// Extra chunks. 
#[test]
fn parse_entry_with_spar() {
    // Minimal entry: header, sparse map, end marker (no FDAT chunk).
    let spar_payload = SparseMap::new(1000, vec![DataRegion::new(0, 100)]).to_bytes();
    let chunks = vec![
        RawChunk::from_data(ChunkType::FHED, vec![0u8; 6]),
        RawChunk::from_data(ChunkType::SPAR, spar_payload),
        RawChunk::from_data(ChunkType::FEND, vec![]),
    ];

    let entry = NormalEntry::try_from(RawEntry(chunks)).unwrap();
    let map = entry.sparse_map().expect("sparse_map should be present");

    assert_eq!(map.logical_size(), 1000);
    assert_eq!(map.regions().len(), 1);
    let only_region = &map.regions()[0];
    assert_eq!(only_region.offset(), 0);
    assert_eq!(only_region.size(), 100);
}
#[test]
fn reject_duplicate_spar_chunk() {
    // The same SPAR payload appears twice between FHED and FEND.
    let spar_payload = SparseMap::new(1000, vec![]).to_bytes();
    let chunks = vec![
        RawChunk::from_data(ChunkType::FHED, vec![0u8; 6]),
        RawChunk::from_data(ChunkType::SPAR, spar_payload.clone()),
        RawChunk::from_data(ChunkType::SPAR, spar_payload),
        RawChunk::from_data(ChunkType::FEND, vec![]),
    ];

    let outcome = NormalEntry::try_from(RawEntry(chunks));

    assert!(outcome.is_err());
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("Duplicate SPAR"));
}
#[test]
fn no_spar_for_non_sparse_entry() {
    // An ordinary entry: header, four bytes of data, end marker.
    let raw_entry = RawEntry(vec![
        RawChunk::from_data(ChunkType::FHED, vec![0u8; 6]),
        RawChunk::from_data(ChunkType::FDAT, vec![1, 2, 3, 4]),
        RawChunk::from_data(ChunkType::FEND, vec![]),
    ]);

    let entry = NormalEntry::try_from(raw_entry).unwrap();
    let chunks = entry.into_chunks();

    // Serializing must not produce a SPAR chunk when none was parsed.
    let has_spar = chunks.iter().any(|chunk| chunk.ty == ChunkType::SPAR);
    assert!(!has_spar, "Non-sparse entry should not have SPAR");
}
-147,6 +147,7 @@ pub struct EntryBuilder { file_size: u128, xattrs: Vec, extra_chunks: Vec, + sparse_map: Option, } impl EntryBuilder { @@ -164,6 +165,7 @@ impl EntryBuilder { file_size: 0, xattrs: Vec::new(), extra_chunks: Vec::new(), + sparse_map: None, } } @@ -423,6 +425,26 @@ impl EntryBuilder { self } + /// Sets the sparse map for this entry. + /// + /// When a sparse map is set, the entry will include a SPAR chunk. + /// The caller is responsible for writing only the data regions, not the full file content. + /// + /// # Important + /// + /// The total bytes written via [`Write`] must equal [`SparseMap::data_size()`], + /// not [`SparseMap::logical_size()`]. The sparse map describes where each written + /// byte belongs in the logical file space. + /// + /// # Returns + /// + /// A mutable reference to the [`EntryBuilder`] with the sparse map set. + #[inline] + pub fn set_sparse_map(&mut self, sparse_map: SparseMap) -> &mut Self { + self.sparse_map = Some(sparse_map); + self + } + /// Sets the maximum chunk size for data written to this entry. /// /// This controls how the entry data is split into chunks when writing. @@ -479,7 +501,27 @@ impl EntryBuilder { if let Some(iv) = self.iv { data.insert(0, iv); } + // Validate sparse_map data size matches written data + if let Some(ref sparse_map) = self.sparse_map { + let expected = sparse_map.data_size().ok_or_else(|| { + io::Error::new(io::ErrorKind::InvalidData, "Sparse map data size overflow") + })? as u128; + if self.file_size != expected { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "Sparse map data_size ({}) does not match written bytes ({})", + expected, self.file_size + ), + )); + } + } + let metadata = Metadata { + // NOTE: For sparse files, raw_file_size is the data size (bytes in FDAT), + // not the logical file size. Use sparse_map.logical_size() for the original + // file size. 
#[test]
fn entry_builder_all_hole_sparse() -> io::Result<()> {
    // 1 GiB logical size with no data regions at all.
    const LOGICAL_SIZE: u64 = 1 << 30;

    let mut builder = EntryBuilder::new_file("hole.bin".into(), WriteOptions::store())?;
    builder.set_sparse_map(SparseMap::new(LOGICAL_SIZE, Vec::new()));
    // Nothing is written: an all-hole file stores zero data bytes.
    let entry = builder.build()?;

    let map = entry.sparse_map().expect("sparse_map should be present");
    assert!(map.is_all_hole());
    assert_eq!(map.logical_size(), LOGICAL_SIZE);
    assert_eq!(map.data_size(), Some(0));
    Ok(())
}
/// A region of actual data in a sparse file.
///
/// Represents a contiguous block of data at a specific offset within the logical file.
/// The data for this region is stored contiguously in the archive's FDAT chunks.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct DataRegion {
    offset: u64,
    size: u64,
}

impl DataRegion {
    /// Creates a new data region.
    ///
    /// # Arguments
    ///
    /// * `offset` - Byte offset in the logical file where this data region starts
    /// * `size` - Size of the data region in bytes
    #[inline]
    pub const fn new(offset: u64, size: u64) -> Self {
        Self { offset, size }
    }

    /// Returns the byte offset in the logical file.
    #[inline]
    pub const fn offset(&self) -> u64 {
        self.offset
    }

    /// Returns the size of the data region in bytes.
    #[inline]
    pub const fn size(&self) -> u64 {
        self.size
    }

    /// Returns the exclusive end offset: `offset + size`.
    ///
    /// Uses saturating arithmetic to prevent overflow. If `offset + size`
    /// would exceed `u64::MAX`, returns `u64::MAX` instead.
    #[inline]
    pub const fn end(&self) -> u64 {
        self.offset.saturating_add(self.size)
    }

    /// Returns `offset + size`, or `None` when the sum does not fit in `u64`.
    ///
    /// Validation uses this instead of [`end`](Self::end) where saturation
    /// would hide a region that cannot exist in any file.
    #[inline]
    const fn checked_end(&self) -> Option<u64> {
        self.offset.checked_add(self.size)
    }
}

/// Sparse file map describing data regions within a file.
///
/// The map contains the logical file size and an ordered list of data regions.
/// Gaps between data regions are holes that read as zeros.
///
/// # SPAR Chunk Format
///
/// ```text
/// +------------------+------------------+------------------+-----+
/// | logical_size     | offset_0         | size_0           | ... |
/// | (8 bytes, u64)   | (8 bytes, u64)   | (8 bytes, u64)   |     |
/// +------------------+------------------+------------------+-----+
/// ```
///
/// All values are unsigned 64-bit integers in big-endian byte order.
///
/// # Invariants
///
/// - Regions are sorted by offset in ascending order
/// - Regions do not overlap: `regions[i].end() <= regions[i+1].offset()`
/// - All regions are within bounds: `region.end() <= logical_size`
/// - No region's `offset + size` overflows `u64`
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct SparseMap {
    logical_size: u64,
    regions: Vec<DataRegion>,
}

impl SparseMap {
    /// Creates a new sparse map.
    ///
    /// # Arguments
    ///
    /// * `logical_size` - The total logical size of the file (including holes)
    /// * `regions` - Data regions in ascending offset order
    ///
    /// # Panics
    ///
    /// Panics if regions violate invariants:
    /// - Not sorted by offset
    /// - Overlapping regions
    /// - Region extends beyond logical_size
    /// - Region's `offset + size` overflows `u64`
    #[inline]
    pub fn new(logical_size: u64, regions: Vec<DataRegion>) -> Self {
        Self::validate_regions(logical_size, &regions);
        Self {
            logical_size,
            regions,
        }
    }

    /// Asserts the map invariants, panicking with a descriptive message on the
    /// first violation found.
    fn validate_regions(logical_size: u64, regions: &[DataRegion]) {
        for (i, pair) in regions.windows(2).enumerate() {
            let (prev, next) = (pair[0], pair[1]);
            assert!(
                prev.offset < next.offset,
                "regions must be sorted by offset in ascending order: {} >= {}",
                prev.offset,
                next.offset
            );
            assert!(
                prev.end() <= next.offset,
                "regions must not overlap: region {} ends at {}, region {} starts at {}",
                i,
                prev.end(),
                i + 1,
                next.offset
            );
        }
        if let Some(last) = regions.last() {
            assert!(
                last.end() <= logical_size,
                "region must be within logical size: region ends at {}, logical size is {}",
                last.end(),
                logical_size
            );
        }
        // `end()` saturates, so the checks above cannot see a region whose true
        // end lies past `u64::MAX` when `logical_size == u64::MAX`. Checked last
        // so the messages for already-rejected input stay the same.
        for (i, region) in regions.iter().enumerate() {
            assert!(
                region.checked_end().is_some(),
                "region must be within logical size: region {} (offset {}, size {}) overflows u64",
                i,
                region.offset,
                region.size
            );
        }
    }

    /// Returns the logical file size (total size including holes).
    #[inline]
    pub const fn logical_size(&self) -> u64 {
        self.logical_size
    }

    /// Returns the data regions.
    #[inline]
    pub fn regions(&self) -> &[DataRegion] {
        &self.regions
    }

    /// Returns the total size of all data regions (actual data, excludes holes).
    ///
    /// This is the amount of data stored in the archive's FDAT chunks.
    /// Returns `None` if the total would overflow u64 (malformed archive).
    #[inline]
    pub fn data_size(&self) -> Option<u64> {
        self.regions
            .iter()
            .try_fold(0u64, |acc, r| acc.checked_add(r.size))
    }

    /// Returns `true` if there are no data regions (entire file is a hole).
    #[inline]
    pub fn is_all_hole(&self) -> bool {
        self.regions.is_empty()
    }

    /// Parses a sparse map from SPAR chunk data.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - Data is too small (less than 8 bytes)
    /// - Data size is not 8 + n*16 bytes
    /// - Regions are not in ascending offset order
    /// - Regions overlap
    /// - A region extends beyond the logical size
    /// - A region's `offset + size` overflows `u64`
    ///
    /// # Panics
    ///
    /// This function does not panic; every slice it reads has a length
    /// established by the size checks.
    #[inline]
    pub fn from_bytes(data: &[u8]) -> io::Result<Self> {
        /// Wraps a message in an `InvalidData` I/O error.
        fn invalid(message: String) -> io::Error {
            io::Error::new(io::ErrorKind::InvalidData, message)
        }

        /// Decodes a big-endian `u64`; callers always pass exactly 8 bytes.
        fn be_u64(bytes: &[u8]) -> u64 {
            let mut buf = [0u8; 8];
            buf.copy_from_slice(bytes);
            u64::from_be_bytes(buf)
        }

        if data.len() < 8 {
            return Err(invalid(format!(
                "SPAR chunk too small: {} bytes, expected at least 8",
                data.len()
            )));
        }
        let (header, body) = data.split_at(8);
        // Each region record is 16 bytes: offset then size.
        let records = body.chunks_exact(16);
        if !records.remainder().is_empty() {
            return Err(invalid(format!(
                "SPAR chunk has invalid size: {} bytes (expected 8 + n*16)",
                data.len()
            )));
        }

        let logical_size = be_u64(header);
        let regions: Vec<DataRegion> = records
            .map(|record| DataRegion::new(be_u64(&record[..8]), be_u64(&record[8..])))
            .collect();

        // Validate constraints per SPAR specification
        for (i, pair) in regions.windows(2).enumerate() {
            let (prev, next) = (pair[0], pair[1]);
            if prev.offset >= next.offset {
                return Err(invalid(format!(
                    "SPAR regions must be in ascending offset order: \
                     region {} offset {} >= region {} offset {}",
                    i,
                    prev.offset,
                    i + 1,
                    next.offset
                )));
            }
            if prev.end() > next.offset {
                return Err(invalid(format!(
                    "SPAR regions must not overlap: \
                     region {} ends at {}, region {} starts at {}",
                    i,
                    prev.end(),
                    i + 1,
                    next.offset
                )));
            }
        }
        if let Some(last) = regions.last() {
            if last.end() > logical_size {
                return Err(invalid(format!(
                    "SPAR region exceeds logical size: \
                     region ends at {}, logical size is {}",
                    last.end(),
                    logical_size
                )));
            }
        }
        // `end()` saturates, so a region whose true end lies past `u64::MAX`
        // passes the checks above when `logical_size == u64::MAX`. Checked last
        // so the errors for already-rejected input stay the same.
        if let Some(i) = regions.iter().position(|r| r.checked_end().is_none()) {
            return Err(invalid(format!(
                "SPAR region exceeds logical size: \
                 region {} at offset {} with size {} overflows u64",
                i, regions[i].offset, regions[i].size
            )));
        }

        Ok(Self {
            logical_size,
            regions,
        })
    }

    /// Serializes the sparse map to SPAR chunk data.
    ///
    /// The returned bytes can be used as the data portion of a SPAR chunk.
    #[inline]
    pub fn to_bytes(&self) -> Vec<u8> {
        let mut data = Vec::with_capacity(8 + self.regions.len() * 16);
        data.extend_from_slice(&self.logical_size.to_be_bytes());
        for region in &self.regions {
            data.extend_from_slice(&region.offset.to_be_bytes());
            data.extend_from_slice(&region.size.to_be_bytes());
        }
        data
    }
}
#[test]
fn sparse_map_adjacent_regions() {
    // Two regions that touch at offset 100 but share no bytes:
    // [0, 100) followed by [100, 200).
    let first = DataRegion::new(0, 100);
    let second = DataRegion::new(first.end(), 100);
    let map = SparseMap::new(200, vec![first, second]);
    assert_eq!(map.data_size(), Some(200));

    // Touching regions must also survive a serialize/parse cycle.
    let reparsed = SparseMap::from_bytes(&map.to_bytes()).unwrap();
    assert_eq!(reparsed.regions().len(), 2);
}
#[test]
fn sparse_map_exceeds_logical_size() {
    // logical_size = 100, one region at offset 50 with size 100,
    // so the region ends at 150, past the logical size.
    let mut data = Vec::with_capacity(24);
    for field in [100u64, 50, 100] {
        data.extend_from_slice(&field.to_be_bytes());
    }

    let err = SparseMap::from_bytes(&data)
        .expect_err("a region ending past logical_size must be rejected");
    assert!(err.to_string().contains("exceeds"));
}
construct a SparseMap with regions that would overflow u64. + // We bypass validation by directly constructing the struct. + let map = SparseMap { + logical_size: u64::MAX, + regions: vec![ + DataRegion::new(0, u64::MAX), + DataRegion::new(u64::MAX, 1), // This would overflow the sum + ], + }; + assert_eq!(map.data_size(), None); + } +} diff --git a/lib/src/lib.rs b/lib/src/lib.rs index b97a8a4af..49699dde4 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -117,6 +117,7 @@ //! - [`WriteOptions`] - Configuration for compression and encryption when writing //! - [`ReadOptions`] - Configuration (password) for reading encrypted entries //! - [`NormalEntry`] / [`SolidEntry`] / [`ReadEntry`] - Entry types for reading +//! - [`SparseMap`] / [`DataRegion`] - Sparse file representation //! - [`Chunk`] / [`ChunkType`] - Low-level chunk primitives //! //! # Feature Flags