Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cli/src/command/append.rs
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,7 @@ fn append_to_archive(args: AppendCommand) -> anyhow::Result<()> {
PathTransformers::new(args.substitutions, args.transforms),
false,
),
sparse: false,
};

let archive = open_archive_then_seek_to_end(&archive_path)?;
Expand Down
45 changes: 43 additions & 2 deletions cli/src/command/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
pub(crate) use self::timestamp::{TimeSource, TimestampStrategy};
use crate::{
cli::{CipherAlgorithmArgs, CompressionAlgorithmArgs, HashAlgorithmArgs, MissingTimePolicy},
utils::{self, PathPartExt, fs::HardlinkResolver},
utils::{self, PathPartExt, fs::HardlinkResolver, sparse::detect_sparse_map},
};
use anyhow::Context;

Check warning on line 25 in cli/src/command/core.rs

View workflow job for this annotation

GitHub Actions / Test WebAssembly (nightly, wasm32-wasip1)

unused import: `anyhow::Context`

Check warning on line 25 in cli/src/command/core.rs

View workflow job for this annotation

GitHub Actions / Test WebAssembly (nightly, wasm32-wasip1)

unused import: `anyhow::Context`
pub(crate) use iter::ReorderByIndex;
pub(crate) use path_filter::PathFilter;
use path_slash::*;
Expand All @@ -42,7 +42,7 @@

/// Detected format of an @archive source.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum SourceFormat {

Check warning on line 45 in cli/src/command/core.rs

View workflow job for this annotation

GitHub Actions / Test WebAssembly (nightly, wasm32-wasip1)

enum `SourceFormat` is never used

Check warning on line 45 in cli/src/command/core.rs

View workflow job for this annotation

GitHub Actions / Test WebAssembly (nightly, wasm32-wasip1)

enum `SourceFormat` is never used
/// PNA archive format (detected by magic bytes)
Pna,
/// mtree manifest format (text-based)
Expand All @@ -53,7 +53,7 @@
///
/// Returns `SourceFormat::Pna` if the data starts with PNA magic bytes,
/// otherwise returns `SourceFormat::Mtree`.
pub(crate) fn detect_format<R: io::BufRead>(reader: &mut R) -> io::Result<SourceFormat> {

Check warning on line 56 in cli/src/command/core.rs

View workflow job for this annotation

GitHub Actions / Test WebAssembly (nightly, wasm32-wasip1)

function `detect_format` is never used

Check warning on line 56 in cli/src/command/core.rs

View workflow job for this annotation

GitHub Actions / Test WebAssembly (nightly, wasm32-wasip1)

function `detect_format` is never used
let buf = reader.fill_buf()?;

Ok(if buf.starts_with(PNA_HEADER) {
Expand Down Expand Up @@ -355,6 +355,7 @@
pub(crate) option: WriteOptions,
pub(crate) keep_options: KeepOptions,
pub(crate) pathname_editor: PathnameEditor,
pub(crate) sparse: bool,
}

#[derive(Clone, Debug)]
Expand Down Expand Up @@ -810,6 +811,41 @@
Ok(())
}

/// Writes file data from a path, detecting and preserving sparse regions.
///
/// If the file is sparse, only data regions are written and the sparse map is set on the entry.
/// If the file is not sparse, falls back to normal write behavior.
pub(crate) fn write_sparse_from_path(
entry: &mut EntryBuilder,
path: impl AsRef<Path>,
) -> io::Result<()> {
use io::Seek;

let path = path.as_ref();
let mut file = fs::File::open(path)?;

if let Some(sparse_map) = detect_sparse_map(&file)? {
// Write only data regions using chunked I/O to avoid memory exhaustion
const CHUNK_SIZE: usize = 64 * 1024;
let mut buf = vec![0u8; CHUNK_SIZE];
for region in sparse_map.regions() {
file.seek(io::SeekFrom::Start(region.offset()))?;
let mut remaining = region.size();
while remaining > 0 {
let to_read = (remaining as usize).min(CHUNK_SIZE);
file.read_exact(&mut buf[..to_read])?;
entry.write_all(&buf[..to_read])?;
remaining -= to_read as u64;
}
}
Comment on lines +829 to +840
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The manual chunked reading loop for writing sparse file data regions can be simplified by using std::io::copy with a take adapter. This is more idiomatic and removes the need for manual buffer management, improving readability and maintainability. It's also important to check that the number of copied bytes matches the expected region size to handle cases where the source file might be modified during the operation.

Suggested change
const CHUNK_SIZE: usize = 64 * 1024;
let mut buf = vec![0u8; CHUNK_SIZE];
for region in sparse_map.regions() {
file.seek(io::SeekFrom::Start(region.offset()))?;
let mut remaining = region.size();
while remaining > 0 {
let to_read = (remaining as usize).min(CHUNK_SIZE);
file.read_exact(&mut buf[..to_read])?;
entry.write_all(&buf[..to_read])?;
remaining -= to_read as u64;
}
}
for region in sparse_map.regions() {
file.seek(io::SeekFrom::Start(region.offset()))?;
let mut limited_reader = (&mut file).take(region.size());
let copied = io::copy(&mut limited_reader, entry)?;
if copied != region.size() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
format!(
"failed to read whole sparse region at offset {}: expected {} bytes, got {}",
region.offset(),
region.size(),
copied
),
));
}
}

entry.set_sparse_map(sparse_map);
Ok(())
} else {
// Not sparse, use normal write
write_from_path(entry, path)
}
}
Comment on lines +814 to +847
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Same u64usize truncation risk as in restore_sparse_file.

Line 835 has the same (remaining as usize).min(CHUNK_SIZE) pattern that can loop infinitely on 32-bit targets when remaining > u32::MAX. Apply the same fix here.

Proposed fix
-                let to_read = (remaining as usize).min(CHUNK_SIZE);
+                let to_read = usize::try_from(remaining).unwrap_or(CHUNK_SIZE).min(CHUNK_SIZE);
🤖 Prompt for AI Agents
In `@cli/src/command/core.rs` around lines 814 - 847, The loop in
write_sparse_from_path casts remaining (u64) to usize causing truncation on
32-bit targets; change the to_read calculation to compute a u64-sized min like
let to_read_u64 = remaining.min(CHUNK_SIZE as u64) and then convert to usize
(to_read = to_read_u64 as usize) before slicing buf and calling file.read_exact
and entry.write_all, ensuring you never cast the full remaining directly and
that buf[..to_read] uses the safely converted usize; update references in
write_sparse_from_path (CHUNK_SIZE, remaining, buf, file.read_exact,
entry.write_all) accordingly.


#[inline]
pub(crate) fn write_from_path(writer: &mut impl Write, path: impl AsRef<Path>) -> io::Result<()> {
let path = path.as_ref();
Expand Down Expand Up @@ -843,6 +879,7 @@
option,
keep_options,
pathname_editor,
sparse,
}: &CreateOptions,
) -> io::Result<Option<NormalEntry>> {
let CollectedEntry {
Expand All @@ -869,7 +906,11 @@
}
StoreAs::File => {
let mut entry = EntryBuilder::new_file(entry_name, option)?;
write_from_path(&mut entry, path)?;
if *sparse {
write_sparse_from_path(&mut entry, path)?;
} else {
write_from_path(&mut entry, path)?;
}
apply_metadata(entry, path, keep_options, metadata)?.build()
}
StoreAs::Dir => {
Expand Down
1 change: 1 addition & 0 deletions cli/src/command/core/mtree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ fn create_entry_from_mtree(
option,
keep_options,
pathname_editor,
sparse: _,
}: &CreateOptions,
) -> io::Result<Option<NormalEntry>> {
let entry_path = mtree_entry.path();
Expand Down
19 changes: 19 additions & 0 deletions cli/src/command/create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ use std::{
group(ArgGroup::new("user-flag").args(["numeric_owner", "uname"])),
group(ArgGroup::new("group-flag").args(["numeric_owner", "gname"])),
group(ArgGroup::new("recursive-flag").args(["recursive", "no_recursive"])),
group(ArgGroup::new("sparse-flag").args(["sparse", "no_sparse"])),
group(ArgGroup::new("keep-dir-flag").args(["keep_dir", "no_keep_dir"])),
group(ArgGroup::new("keep-xattr-flag").args(["keep_xattr", "no_keep_xattr"])),
group(ArgGroup::new("keep-timestamp-flag").args(["keep_timestamp", "no_keep_timestamp"])),
Expand Down Expand Up @@ -164,6 +165,18 @@ pub(crate) struct CreateCommand {
help = "Compress multiple files together for better compression ratio"
)]
solid: bool,
#[arg(
long,
requires = "unstable",
help = "Detect and preserve sparse files (unstable)"
)]
sparse: bool,
#[arg(
long,
requires = "unstable",
help = "Do not detect sparse files. This is the inverse option of --sparse (unstable)"
)]
no_sparse: bool,
#[arg(long, value_name = "NAME", help = "Set user name for archive entries")]
uname: Option<String>,
#[arg(long, value_name = "NAME", help = "Set group name for archive entries")]
Expand Down Expand Up @@ -533,6 +546,7 @@ fn create_archive(args: CreateCommand) -> anyhow::Result<()> {
write_option,
keep_options,
solid: args.solid,
sparse: args.sparse,
pathname_editor,
};
if let Some(size) = max_file_size {
Expand Down Expand Up @@ -568,6 +582,7 @@ pub(crate) struct CreationContext {
pub(crate) write_option: WriteOptions,
pub(crate) keep_options: KeepOptions,
pub(crate) solid: bool,
pub(crate) sparse: bool,
pub(crate) pathname_editor: PathnameEditor,
}

Expand All @@ -577,6 +592,7 @@ pub(crate) fn create_archive_file<W, F>(
write_option,
keep_options,
solid,
sparse,
pathname_editor,
}: CreationContext,
target_items: Vec<CollectedItem>,
Expand All @@ -598,6 +614,7 @@ where
option,
keep_options,
pathname_editor,
sparse,
};
let rx = spawn_entry_results(
target_items,
Expand Down Expand Up @@ -638,6 +655,7 @@ fn create_archive_with_split(
write_option,
keep_options,
solid,
sparse,
pathname_editor,
}: CreationContext,
target_items: Vec<CollectedItem>,
Expand All @@ -656,6 +674,7 @@ fn create_archive_with_split(
option,
keep_options,
pathname_editor,
sparse,
};
let rx = spawn_entry_results(
target_items,
Expand Down
74 changes: 72 additions & 2 deletions cli/src/command/extract.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@
};
use anyhow::Context;
use clap::{ArgGroup, Parser, ValueHint};
use pna::{DataKind, EntryName, EntryReference, NormalEntry, Permission, ReadOptions, prelude::*};
use pna::{
DataKind, EntryName, EntryReference, NormalEntry, Permission, ReadOptions, SparseMap,
prelude::*,
};
#[cfg(target_os = "macos")]
use std::os::macos::fs::FileTimesExt;
#[cfg(windows)]
Expand Down Expand Up @@ -899,7 +902,9 @@

match entry_kind {
DataKind::File => {
if *safe_writes {
let sparse_map = item.sparse_map();
if *safe_writes && sparse_map.is_none() {
// Safe writes (atomic rename) - only for non-sparse files
let mut safe_writer = SafeWriter::new(&path)?;
{
let mut writer =
Expand All @@ -911,6 +916,12 @@
// Set timestamps before persist; after rename we lose the file handle
restore_timestamps(safe_writer.as_file_mut(), item.metadata(), keep_options)?;
safe_writer.persist()?;
} else if let Some(sparse_map) = sparse_map {
// Sparse file restoration - write data regions at correct offsets
let mut file = utils::fs::file_create(&path, remove_existing)?;
let mut reader = item.reader(ReadOptions::with_password(password))?;
restore_sparse_file(&mut file, &mut reader, sparse_map)?;
restore_timestamps(&mut file, item.metadata(), keep_options)?;
Comment on lines +919 to +924
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Missing explicit file removal before creation in the sparse path.

The non-sparse, non-safe-writes branch (Line 926-929) explicitly removes the existing path before calling file_create when remove_existing is true. The sparse branch here skips that step. If file_create doesn't handle all edge cases (e.g., existing symlinks), this could cause extraction failures under --overwrite.

Proposed fix
         } else if let Some(sparse_map) = sparse_map {
             // Sparse file restoration - write data regions at correct offsets
+            if remove_existing {
+                utils::io::ignore_not_found(utils::fs::remove_path(&path))?;
+            }
             let mut file = utils::fs::file_create(&path, remove_existing)?;
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
} else if let Some(sparse_map) = sparse_map {
// Sparse file restoration - write data regions at correct offsets
let mut file = utils::fs::file_create(&path, remove_existing)?;
let mut reader = item.reader(ReadOptions::with_password(password))?;
restore_sparse_file(&mut file, &mut reader, sparse_map)?;
restore_timestamps(&mut file, item.metadata(), keep_options)?;
} else if let Some(sparse_map) = sparse_map {
// Sparse file restoration - write data regions at correct offsets
if remove_existing {
utils::io::ignore_not_found(utils::fs::remove_path(&path))?;
}
let mut file = utils::fs::file_create(&path, remove_existing)?;
let mut reader = item.reader(ReadOptions::with_password(password))?;
restore_sparse_file(&mut file, &mut reader, sparse_map)?;
restore_timestamps(&mut file, item.metadata(), keep_options)?;
🤖 Prompt for AI Agents
In `@cli/src/command/extract.rs` around lines 919 - 924, The sparse-file
extraction branch skips removing an existing path before creating the file,
which can fail for symlinks or other edge cases when overwrite/remove_existing
is enabled; update the sparse branch (where sparse_map is Some and you call
utils::fs::file_create, item.reader, restore_sparse_file, restore_timestamps) to
mirror the non-sparse branch by checking remove_existing and calling the same
removal helper used elsewhere (e.g., the utils::fs::remove_* path removal
function your codebase uses) on path prior to calling utils::fs::file_create,
then proceed to open the reader and call restore_sparse_file and
restore_timestamps as before.

} else {
if remove_existing {
utils::io::ignore_not_found(utils::fs::remove_path(&path))?;
Expand Down Expand Up @@ -1057,7 +1068,7 @@
MacMetadataStrategy::Always
) && item.mac_metadata().is_some();
#[cfg(not(target_os = "macos"))]
let skip_xattr_acl = false;

Check warning on line 1071 in cli/src/command/extract.rs

View workflow job for this annotation

GitHub Actions / Test WebAssembly (nightly, wasm32-wasip1)

unused variable: `skip_xattr_acl`

Check warning on line 1071 in cli/src/command/extract.rs

View workflow job for this annotation

GitHub Actions / Test WebAssembly (nightly, wasm32-wasip1)

unused variable: `skip_xattr_acl`

#[cfg(unix)]
if !skip_xattr_acl {
Expand Down Expand Up @@ -1311,6 +1322,65 @@
Ok(())
}

/// Restores a sparse file by writing only data regions and seeking over holes.
///
/// This creates a sparse file on filesystems that support it by using seek
/// to skip over hole regions instead of writing zeros.
fn restore_sparse_file(
file: &mut fs::File,
reader: &mut impl Read,
sparse_map: &SparseMap,
) -> io::Result<()> {
use io::Seek;

let expected_data_size = sparse_map.data_size().ok_or_else(|| {
io::Error::new(
io::ErrorKind::InvalidData,
"Sparse map data size overflow (corrupted archive)",
)
})?;

// Write each data region at its correct offset using chunked I/O
const CHUNK_SIZE: usize = 64 * 1024;
let mut buf = vec![0u8; CHUNK_SIZE];
let mut total_read = 0u64;
for region in sparse_map.regions() {
file.seek(io::SeekFrom::Start(region.offset()))?;
let mut remaining = region.size();
while remaining > 0 {
let to_read = (remaining as usize).min(CHUNK_SIZE);
reader.read_exact(&mut buf[..to_read]).map_err(|e| {
io::Error::new(
e.kind(),
format!(
"Failed to read sparse data at offset {}: {} \
(expected {} bytes total, read {} so far)",
region.offset(),
e,
expected_data_size,
total_read
),
)
})?;
file.write_all(&buf[..to_read])?;
remaining -= to_read as u64;
total_read += to_read as u64;
}
Comment on lines +1348 to +1368
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Potential infinite loop on 32-bit platforms due to u64usize truncation.

On a 32-bit target, remaining as usize (Line 1351) silently truncates values above u32::MAX. If the truncated result is 0, the loop will never decrease remaining, spinning forever. While 32-bit is unlikely for large sparse regions, this is a correctness gap.

Proposed fix using safe conversion
         while remaining > 0 {
-            let to_read = (remaining as usize).min(CHUNK_SIZE);
+            let to_read = usize::try_from(remaining).unwrap_or(CHUNK_SIZE).min(CHUNK_SIZE);
🤖 Prompt for AI Agents
In `@cli/src/command/extract.rs` around lines 1348 - 1368, The loop currently
converts remaining to usize with (remaining as usize) which can truncate on
32-bit targets and cause an infinite loop; change how to compute to_read by
comparing remaining with CHUNK_SIZE in u64 form (e.g. let to_read =
std::cmp::min(remaining, CHUNK_SIZE as u64) as usize) so the cast is safe, then
use &mut buf[..to_read] in reader.read_exact and file.write_all as before;
update the loop around remaining/total_read (in the function using
region.size(), region.offset(), CHUNK_SIZE, buf, reader.read_exact, and
file.write_all) accordingly to avoid any u64→usize truncation.

}

// Verify total bytes read matches expected
debug_assert_eq!(
total_read, expected_data_size,
"Sparse data size mismatch: read {}, expected {}",
total_read, expected_data_size
);

// Set the file length to the logical size (handles trailing holes)
file.set_len(sparse_map.logical_size())?;

Ok(())
}

fn ensure_directory_components(path: &Path, unlink_first: bool) -> io::Result<()> {
if path.as_os_str().is_empty() {
return Ok(());
Expand Down
3 changes: 3 additions & 0 deletions cli/src/command/stdio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -951,6 +951,7 @@ fn run_create_archive(args: StdioCommand) -> anyhow::Result<()> {
write_option: cli_option,
keep_options,
solid: args.solid,
sparse: false,
pathname_editor: PathnameEditor::new(
args.strip_components,
PathTransformers::new(args.substitutions, args.transforms),
Expand Down Expand Up @@ -1258,6 +1259,7 @@ fn run_append(args: StdioCommand) -> anyhow::Result<()> {
PathTransformers::new(args.substitutions, args.transforms),
args.absolute_paths,
),
sparse: false,
};

// NOTE: "-" will use stdin/out
Expand Down Expand Up @@ -1412,6 +1414,7 @@ fn run_update(args: StdioCommand) -> anyhow::Result<()> {
PathTransformers::new(args.substitutions, args.transforms),
args.absolute_paths,
),
sparse: false,
};

// NOTE: "-" is not supported for update mode
Expand Down
1 change: 1 addition & 0 deletions cli/src/command/update.rs
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,7 @@ fn update_archive(args: UpdateCommand) -> anyhow::Result<()> {
PathTransformers::new(args.substitutions, args.transforms),
false,
),
sparse: false,
};

let archives = collect_split_archives(&args.file.archive)?;
Expand Down
1 change: 1 addition & 0 deletions cli/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ pub(crate) mod mmap;
pub(crate) mod os;
mod path;
pub(crate) mod process;
pub(crate) mod sparse;
pub(crate) mod str;

pub(crate) use {globs::*, path::*};
Expand Down
Loading
Loading