diff --git a/.vscode/cspell.dictionaries/workspace.wordlist.txt b/.vscode/cspell.dictionaries/workspace.wordlist.txt index 16769ece5ce..68f8175ea4d 100644 --- a/.vscode/cspell.dictionaries/workspace.wordlist.txt +++ b/.vscode/cspell.dictionaries/workspace.wordlist.txt @@ -42,6 +42,9 @@ langid lscolors mdbook memchr +memmap +mmap +Mmap multifilereader onig ouroboros diff --git a/Cargo.lock b/Cargo.lock index 10b9a7553d5..743e40fdd9a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4116,6 +4116,7 @@ dependencies = [ "itertools 0.14.0", "libc", "memchr", + "memmap2", "rand 0.10.1", "rayon", "rustix", diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 6d949006efd..d0d8282e07f 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -1275,6 +1275,15 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "memmap2" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" +dependencies = [ + "libc", +] + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -2050,6 +2059,7 @@ dependencies = [ "itertools", "libc", "memchr", + "memmap2", "rand", "rayon", "rustix", diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index db51104c096..707daac201f 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -31,6 +31,7 @@ clap = { workspace = true } compare = { workspace = true } itertools = { workspace = true } memchr = { workspace = true } +memmap2 = { workspace = true } rand = { workspace = true } rayon = { workspace = true } self_cell = { workspace = true } diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index 85276fce6d8..9289b541929 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -13,55 +13,32 @@ use std::{ cmp::Ordering, - ffi::{OsStr, OsString}, + ffi::OsString, fs::{self, File}, io::{BufWriter, Read, Write}, iter, path::{Path, PathBuf}, process::{Child, ChildStdin, ChildStdout, Command, Stdio}, rc::Rc, - sync::mpsc::{Receiver, Sender, SyncSender, channel, sync_channel}, + sync::{ + Arc, + mpsc::{Receiver, Sender, SyncSender, channel, sync_channel}, + }, thread::{self, JoinHandle}, }; use compare::Compare; +use memmap2::Mmap as MemoryMap; use uucore::error::{FromIo, UResult}; use crate::{ GlobalSettings, Output, SortError, chunks::{self, Chunk, RecycledChunk}, - compare_by, current_open_fd_count, fd_soft_limit, open, + compare_by, current_open_fd_count, fd_soft_limit, + sort_inputs::{OpenedInput, SortInputs}, tmp_dir::TmpDirWrapper, }; -/// If the output file occurs in the input files as well, copy the contents of the output file -/// and replace its occurrences in the inputs with that copy. -fn replace_output_file_in_input_files( - files: &mut [OsString], - output: Option<&OsStr>, - tmp_dir: &mut TmpDirWrapper, -) -> UResult<()> { - let mut copy: Option = None; - if let Some(Ok(output_path)) = output.map(|path| Path::new(path).canonicalize()) { - for file in files { - if let Ok(file_path) = Path::new(file).canonicalize() { - if file_path == output_path { - if let Some(copy) = © { - *file = copy.clone().into_os_string(); - } else { - let (_file, copy_path) = tmp_dir.next_file()?; - fs::copy(file_path, ©_path) - .map_err(|error| SortError::OpenTmpFileFailed { error })?; - *file = copy_path.clone().into_os_string(); - copy = Some(copy_path); - } - } - } - } - } - Ok(()) -} - /// Determine the effective merge batch size, enforcing a minimum and respecting the /// file-descriptor soft limit after reserving stdio/output and a safety margin. fn effective_merge_batch_size(settings: &GlobalSettings) -> usize { @@ -89,6 +66,41 @@ fn effective_merge_batch_size(settings: &GlobalSettings) -> usize { batch_size } +/// If the output file is also listed as an input, use memory-map to load it before +/// it gets opened for writing. This allows reading the original content +/// via memory-map while writing to the same file, without needing a temp copy. +fn load_output_as_input( + output: &Output, + files: &[OsString], +) -> UResult)>> { + let Some(name) = output.as_output_name() else { + return Ok(None); + }; + let output_path = Path::new(name).canonicalize()?; + let appears = files + .iter() + .any(|f| Path::new(f).canonicalize().is_ok_and(|p| p == output_path)); + if appears { + let read_fd = File::open(name).map_err(|error| SortError::ReadFailed { + path: output_path.clone(), + error, + })?; + // SAFETY: We keep the read_fd open for the lifetime of the memory-map, + // and we only read from it and the file is not modified while the + // memory-map exists by the current process (writing happens later via a separate FD). + // Yet, it may be possible that another process alter the content and consequently read corrupted data + let output_as_input = Arc::new(unsafe { MemoryMap::map(&read_fd) }.map_err(|error| { + SortError::ReadFailed { + path: output_path.clone(), + error, + } + })?); + Ok(Some((output_path, output_as_input))) + } else { + Ok(None) + } +} + /// Merge pre-sorted `Box`s. /// /// If `settings.merge_batch_size` is greater than the length of `files`, intermediate files will be used. @@ -99,10 +111,12 @@ pub fn merge( output: Output, tmp_dir: &mut TmpDirWrapper, ) -> UResult<()> { - replace_output_file_in_input_files(files, output.as_output_name(), tmp_dir)?; - let files = files - .iter() - .map(|file| open(file).map(|file| PlainMergeInput { inner: file })); + let output_as_input = load_output_as_input(&output, files)?; + + let sort_inputs = SortInputs::from_files_with_output(files, output_as_input); + let files = sort_inputs + .into_iter() + .map(|result| result.map(|input| PlainMergeInput:: { inner: input })); if settings.compress_prog.is_none() { merge_with_file_limit::<_, _, WriteablePlainTmpFile>(files, settings, output, tmp_dir) } else { @@ -596,3 +610,63 @@ impl MergeInput for PlainMergeInput { &mut self.inner } } + +#[cfg(test)] +#[cfg(not(target_os = "wasi"))] +mod tests { + use super::*; + use std::io::Write; + use tempfile::NamedTempFile; + + #[test] + fn test_merge_output_as_input() { + // Setup: output file 'out' contains "6\n", inputs in/1..in/5 contain 1..5 + let mut out = NamedTempFile::new().unwrap(); + out.write_all(b"6\n").unwrap(); + out.flush().unwrap(); + let out_path = out.path().as_os_str().to_os_string(); + + let inputs: Vec = (1..=5) + .map(|i| { + let mut f = NamedTempFile::new().unwrap(); + writeln!(f, "{i}").unwrap(); + f.flush().unwrap(); + f + }) + .collect(); + + let files = vec![ + out_path.clone(), + inputs[0].path().as_os_str().to_os_string(), + inputs[1].path().as_os_str().to_os_string(), + inputs[2].path().as_os_str().to_os_string(), + inputs[3].path().as_os_str().to_os_string(), + inputs[4].path().as_os_str().to_os_string(), + out_path.clone(), + ]; + + // Check Opened SortInputs: 7 inputs but only 6 unique sources + let output_canon = out.path().canonicalize().unwrap(); + let read_fd = File::open(out.path()).unwrap(); + let output_as_input = Arc::new(unsafe { MemoryMap::map(&read_fd).unwrap() }); + + let sort_inputs = + SortInputs::from_files_with_output(&files, Some((output_canon, output_as_input))); + + assert_eq!(sort_inputs.len(), 7); + assert_eq!(sort_inputs.unique_count(), 6); + + // Run merge + let mut files_mut = files.clone(); + let settings = GlobalSettings::default(); + let output = Output::new(Some(out_path.as_os_str())).unwrap(); + let mut tmp_dir = TmpDirWrapper::new(std::env::temp_dir()); + + merge(&mut files_mut, &settings, output, &mut tmp_dir).unwrap(); + + // If merge succeeded with only 6 unique sources, memory-map deduplication worked. + // Verify correctness. + let result = fs::read_to_string(out.path()).unwrap(); + assert_eq!(result, "1\n2\n3\n4\n5\n6\n6\n"); + } +} diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index d27f6eadb1a..1dd143772e3 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -17,6 +17,7 @@ mod custom_str_cmp; mod ext_sort; mod merge; mod numeric_str_cmp; +mod sort_inputs; mod tmp_dir; use bigdecimal::BigDecimal; diff --git a/src/uu/sort/src/sort_inputs/iterator.rs b/src/uu/sort/src/sort_inputs/iterator.rs new file mode 100644 index 00000000000..053a3821e91 --- /dev/null +++ b/src/uu/sort/src/sort_inputs/iterator.rs @@ -0,0 +1,295 @@ +use crate::SortError; +use crate::sort_inputs::{DeferredInput, InputAccess, OpenedInput, SortInput, SortInputs}; +use memmap2::Mmap as MemoryMap; +use std::{collections::HashMap, fs::File, path::PathBuf, sync::Arc, vec::IntoIter}; +use uucore::error::UResult; + +/// Iterator that opens deferred input entries as they are yielded. +#[derive(Debug, Default)] +pub struct SortInputsIntoIter { + inner: IntoIter, + memory_map_files: HashMap>, +} + +impl SortInputsIntoIter { + fn open_file(path: PathBuf) -> UResult { + File::open(&path) + .map(OpenedInput::File) + .map_err(|error| SortError::ReadFailed { path, error }.into()) + } + + fn shared_memory_map(&mut self, path: PathBuf) -> UResult { + if let Some(memory_map) = self.memory_map_files.get(&path) { + return Ok(OpenedInput::SharedMemoryMap { + data: memory_map.clone(), + offset: 0, + }); + } + + let file = File::open(&path).map_err(|error| SortError::ReadFailed { + path: path.clone(), + error, + })?; + + // SAFETY: This creates a read-only memory map for an input file. The map is + // only exposed through `OpenedInput::SharedMemoryMap`, which implements + // `Read` by copying bytes out and never mutates the mapped region. + let memory_map = + Arc::new( + unsafe { MemoryMap::map(&file) }.map_err(|error| SortError::ReadFailed { + path: path.clone(), + error, + })?, + ); + + self.memory_map_files.insert(path, memory_map.clone()); + + Ok(OpenedInput::SharedMemoryMap { + data: memory_map, + offset: 0, + }) + } +} + +impl Iterator for SortInputsIntoIter { + type Item = UResult; + + fn next(&mut self) -> Option { + let input = self.inner.next()?; + + let opened_input = match input.inner { + DeferredInput::Stdin => Ok(OpenedInput::Stdin), + DeferredInput::Path { + path, + access: InputAccess::OpenFile, + } => Self::open_file(path), + DeferredInput::Path { + path, + access: InputAccess::SharedMemoryMap, + } => self.shared_memory_map(path), + DeferredInput::OutputSnapshot(data) => { + Ok(OpenedInput::SharedMemoryMap { data, offset: 0 }) + } + }; + + Some(opened_input) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +impl ExactSizeIterator for SortInputsIntoIter { + fn len(&self) -> usize { + self.inner.len() + } +} + +impl IntoIterator for SortInputs { + type Item = UResult; + type IntoIter = SortInputsIntoIter; + + fn into_iter(self) -> Self::IntoIter { + SortInputsIntoIter { + inner: self.inputs.into_iter(), + memory_map_files: HashMap::new(), + } + } +} + +#[cfg(test)] +mod test { + use super::SortInputsIntoIter; + use crate::sort_inputs::{DeferredInput, InputAccess, OpenedInput, SortInput, SortInputs}; + use memmap2::Mmap as MemoryMap; + #[cfg(not(target_os = "wasi"))] + use std::{ + fs::File, + io::{Read, Write}, + sync::Arc, + }; + #[cfg(not(target_os = "wasi"))] + use tempfile::NamedTempFile; + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_open_file_opens_as_file() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"unique file data") + .expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let opened_input = + SortInputsIntoIter::open_file(tmpfile.path().to_path_buf()).expect("should open input"); + + match opened_input { + OpenedInput::File(mut file) => { + let mut contents = String::new(); + file.read_to_string(&mut contents) + .expect("should read opened file"); + assert_eq!(contents, "unique file data"); + } + other => panic!("expected opened file, got {other:?}"), + } + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_shared_memory_map_opens_as_shared_memory_map() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"duplicate data") + .expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let mut iter = SortInputsIntoIter::default(); + let opened_input = iter + .shared_memory_map(tmpfile.path().to_path_buf()) + .expect("should open shared memory map"); + + match opened_input { + OpenedInput::SharedMemoryMap { data, offset } => { + assert_eq!(offset, 0); + assert_eq!(&data[..], b"duplicate data"); + } + other => panic!("expected shared memory map, got {other:?}"), + } + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_shared_memory_maps_have_independent_offsets() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"independent offsets") + .expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let path = tmpfile.path().to_path_buf(); + let mut iter = SortInputsIntoIter::default(); + let mut first = iter + .shared_memory_map(path.clone()) + .expect("should open first shared memory map"); + let mut second = iter + .shared_memory_map(path) + .expect("should open second shared memory map"); + + let mut first_buf = [0u8; 11]; + first + .read_exact(&mut first_buf) + .expect("should read from first mmap"); + assert_eq!(&first_buf, b"independent"); + + let mut second_buf = [0u8; 11]; + second + .read_exact(&mut second_buf) + .expect("should read from second mmap"); + assert_eq!(&second_buf, b"independent"); + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_inputs_into_iter_opens_each_input_kind() { + let mut open_file_input = NamedTempFile::new().expect("should create open-file input"); + open_file_input + .write_all(b"open file input") + .expect("should write open-file input"); + open_file_input + .flush() + .expect("should flush open-file input"); + + let mut shared_memory_input = + NamedTempFile::new().expect("should create shared-memory input"); + shared_memory_input + .write_all(b"shared memory input") + .expect("should write shared-memory input"); + shared_memory_input + .flush() + .expect("should flush shared-memory input"); + + let mut output = NamedTempFile::new().expect("should create output file"); + output + .write_all(b"output snapshot") + .expect("should write output file"); + output.flush().expect("should flush output file"); + let output_file = File::open(output.path()).expect("should open output for snapshot"); + let output_snapshot = + Arc::new(unsafe { MemoryMap::map(&output_file).expect("should mmap output snapshot") }); + + let inputs = SortInputs { + inputs: vec![ + SortInput { + inner: DeferredInput::Stdin, + }, + SortInput { + inner: DeferredInput::Path { + path: open_file_input.path().to_path_buf(), + access: InputAccess::OpenFile, + }, + }, + SortInput { + inner: DeferredInput::Path { + path: shared_memory_input.path().to_path_buf(), + access: InputAccess::SharedMemoryMap, + }, + }, + SortInput { + inner: DeferredInput::OutputSnapshot(output_snapshot), + }, + ], + }; + + let mut iter = inputs.into_iter(); + assert_eq!(iter.len(), 4); + + assert!(matches!( + iter.next() + .expect("should yield stdin") + .expect("stdin is valid"), + OpenedInput::Stdin + )); + assert_eq!(iter.len(), 3); + + match iter + .next() + .expect("should yield open-file input") + .expect("should open file") + { + OpenedInput::File(mut file) => { + let mut contents = String::new(); + file.read_to_string(&mut contents) + .expect("should read opened file"); + assert_eq!(contents, "open file input"); + } + other => panic!("expected opened file, got {other:?}"), + } + + match iter + .next() + .expect("should yield shared-memory input") + .expect("should open shared-memory input") + { + OpenedInput::SharedMemoryMap { data, offset } => { + assert_eq!(offset, 0); + assert_eq!(&data[..], b"shared memory input"); + } + other => panic!("expected shared memory map, got {other:?}"), + } + + match iter + .next() + .expect("should yield output snapshot") + .expect("should open output snapshot") + { + OpenedInput::SharedMemoryMap { data, offset } => { + assert_eq!(offset, 0); + assert_eq!(&data[..], b"output snapshot"); + } + other => panic!("expected output snapshot memory map, got {other:?}"), + } + + assert!(iter.next().is_none()); + } +} diff --git a/src/uu/sort/src/sort_inputs/mod.rs b/src/uu/sort/src/sort_inputs/mod.rs new file mode 100644 index 00000000000..74923e47860 --- /dev/null +++ b/src/uu/sort/src/sort_inputs/mod.rs @@ -0,0 +1,4 @@ +mod iterator; +mod model; + +pub use model::{DeferredInput, InputAccess, OpenedInput, SortInput, SortInputs}; diff --git a/src/uu/sort/src/sort_inputs/model.rs b/src/uu/sort/src/sort_inputs/model.rs new file mode 100644 index 00000000000..1ea23289bf1 --- /dev/null +++ b/src/uu/sort/src/sort_inputs/model.rs @@ -0,0 +1,521 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. +//! Input file handling for sort merge. +//! +//! Dedupes duplicate paths via memory-map and defers opening unique files until +//! iteration so `merge_with_file_limit` respects its batch size. + +use std::{ + collections::HashMap, + ffi::OsString, + fs::File, + io::{self, Read}, + path::{Path, PathBuf}, + sync::Arc, +}; + +use crate::STDIN_FILE; +use memmap2::Mmap as MemoryMap; + +/// Deferred representation of a sort input before any file descriptor is opened. +#[derive(Debug)] +pub enum DeferredInput { + Stdin, + /// A file whose open() is deferred until iteration. + Path { + path: PathBuf, + access: InputAccess, + }, + OutputSnapshot(Arc), +} + +/// Describes how a deferred path should be opened when yielded by the iterator. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum InputAccess { + OpenFile, + SharedMemoryMap, +} + +/// Concrete input opened by `SortInputsIntoIter`; this is the type consumed by merge. +#[derive(Debug)] +pub enum OpenedInput { + Stdin, + File(File), + SharedMemoryMap { data: Arc, offset: usize }, +} + +impl Read for OpenedInput { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self { + Self::File(file) => file.read(buf), + Self::SharedMemoryMap { data, offset } => { + let pos = *offset; + let available = data.len().saturating_sub(pos); + let to_read = buf.len().min(available); + + if to_read > 0 { + buf[..to_read].copy_from_slice(&data[pos..pos + to_read]); + *offset = pos + to_read; + } + + Ok(to_read) + } + Self::Stdin => { + let mut stdin = io::stdin(); + stdin.read(buf) + } + } + } +} + +/// Handle to a single sort input (file, memory-map, or stdin). +#[derive(Debug)] +pub struct SortInput { + pub inner: DeferredInput, +} + +impl SortInput { + fn stdin() -> Self { + Self { + inner: DeferredInput::Stdin, + } + } + + fn to_args_path(path: PathBuf, access: InputAccess) -> Self { + Self { + inner: DeferredInput::Path { path, access }, + } + } + + fn to_output(memory_map: Arc) -> Self { + Self { + inner: DeferredInput::OutputSnapshot(memory_map), + } + } +} + +/// Collection of sort inputs. +/// +/// Preserves argument order and multiplicity. Duplicate paths are memory-map'd once +/// and shared; unique paths are stored lazily and opened during iteration so the +/// merge batch size limits active FDs. +#[derive(Debug)] +pub struct SortInputs { + pub inputs: Vec, +} + +impl SortInputs { + /// Build a `SortInputs` from paths. + /// + /// - Duplicate paths → stored as deferred paths marked for shared memory-map access. + /// - Unique paths → stored as deferred paths marked for regular file access. + /// - `output_as_input` → stored as a pre-created memory-map snapshot. + pub fn from_files_with_output( + files: &[OsString], + output_as_input: Option<(PathBuf, Arc)>, + ) -> Self { + let mut inputs = Vec::with_capacity(files.len()); + + // First pass: count occurrences of each path to identify duplicates + let mut path_counts: HashMap = HashMap::new(); + for file in files { + if file != STDIN_FILE { + let path = Path::new(file); + let canonical = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); + *path_counts.entry(canonical).or_insert(0) += 1; + } + } + + // Second pass: build inputs + // - Unique files: deferred path opened as a regular file during iteration + // - Duplicate files: deferred path opened as a shared memory-map during iteration + // - Output-as-input: use pre-created memory-map snapshot + // - Stdin: preserve each occurrence + for file in files { + if file == STDIN_FILE { + inputs.push(SortInput::stdin()); + continue; + } + + let path = Path::new(file); + let canonical = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); + // Check if this is the output file used as input + // Then use already opened fd for output + if let Some((ref output_path, ref output_memory_map)) = output_as_input { + if canonical == *output_path { + inputs.push(SortInput::to_output(output_memory_map.clone())); + continue; + } + } + + // Unique file as input + if *path_counts.get(&canonical).unwrap_or(&0) <= 1 { + inputs.push(SortInput::to_args_path( + PathBuf::from(file), + InputAccess::OpenFile, + )); + // Duplicate input + } else { + inputs.push(SortInput::to_args_path( + PathBuf::from(file), + InputAccess::SharedMemoryMap, + )); + } + } + + Self { + inputs: inputs.into_iter().collect(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::ffi::{OsStr, OsString}; + #[cfg(not(target_os = "wasi"))] + use std::io::Write; + #[cfg(not(target_os = "wasi"))] + use tempfile::NamedTempFile; + + // Util method for SortInputs in test + impl SortInputs { + pub fn from_files(files: &[OsString]) -> Self { + Self::from_files_with_output(files, None) + } + + /// Returns the total number of inputs (including duplicates). + pub fn len(&self) -> usize { + self.inputs.len() + } + + /// Returns true if there are no inputs. + fn is_empty(&self) -> bool { + self.inputs.is_empty() + } + + /// Returns the number of unique sources (stdin + unique files + memory-map groups). + pub fn unique_count(&self) -> usize { + let mut stdin_present = false; + let mut seen_paths = std::collections::HashSet::new(); + let mut seen_memory_maps = std::collections::HashSet::new(); + + for input in &self.inputs { + match &input.inner { + DeferredInput::Stdin => { + stdin_present = true; + } + DeferredInput::Path { path, .. } => { + let canonical = path.canonicalize().unwrap_or_else(|_| path.clone()); + seen_paths.insert(canonical); + } + DeferredInput::OutputSnapshot(data) => { + seen_memory_maps.insert(Arc::as_ptr(data)); + } + } + } + + seen_paths.len() + seen_memory_maps.len() + usize::from(stdin_present) + } + + /// Iterate over the inputs without consuming them. + fn iter(&self) -> impl Iterator { + self.inputs.iter() + } + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_input_new_file() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"hello world") + .expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let input = SortInput::to_args_path(tmpfile.path().to_path_buf(), InputAccess::OpenFile); + + assert!(matches!( + input.inner, + DeferredInput::Path { + access: InputAccess::OpenFile, + .. + } + )); + } + #[test] + fn test_sort_input_new_stdin() { + let input = SortInput::stdin(); + assert!(matches!(input.inner, DeferredInput::Stdin)); + } + + #[test] + fn test_sort_input_new_missing_file() { + let file = OsStr::new("/nonexistent/path/file.txt"); + + let lazy_input = SortInput::to_args_path(PathBuf::from(file), InputAccess::OpenFile); + + assert!(matches!( + lazy_input.inner, + DeferredInput::Path { + access: InputAccess::OpenFile, + .. + } + )); + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_input_memory_map_read() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"memory_map test data") + .expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let file = File::open(tmpfile.path()).expect("should open temp file"); + let memory_map = + Arc::new(unsafe { MemoryMap::map(&file).expect("should memory_map temp file") }); + let mut input = OpenedInput::SharedMemoryMap { + data: memory_map, + offset: 0, + }; + let mut buf = [0u8; 20]; + let n = input.read(&mut buf).expect("should read from input"); + assert_eq!(n, 20); + assert_eq!(&buf, b"memory_map test data"); + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_input_into_box_read() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"test data") + .expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let file = File::open(tmpfile.path()).expect("should open temp file"); + let mmap = Arc::new(unsafe { MemoryMap::map(&file).expect("should mmap temp file") }); + let input = OpenedInput::SharedMemoryMap { + data: mmap, + offset: 0, + }; + let mut reader: Box = Box::new(input); + let mut buf = [0u8; 9]; + let n = reader + .read(&mut buf) + .expect("should read from boxed reader"); + assert_eq!(n, 9); + assert_eq!(&buf, b"test data"); + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_input_mmap_independent_reads() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"independent reads") + .expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let file = File::open(tmpfile.path()).expect("should open temp file"); + let mmap = Arc::new(unsafe { MemoryMap::map(&file).expect("should mmap temp file") }); + + let mut input1 = OpenedInput::SharedMemoryMap { + data: mmap.clone(), + offset: 0, + }; + let mut input2 = OpenedInput::SharedMemoryMap { + data: mmap, + offset: 0, + }; + + // Both should be able to read independently + let mut buf1 = [0u8; 11]; + input1 + .read_exact(&mut buf1) + .expect("should read from first input"); + assert_eq!(&buf1, b"independent"); + + let mut buf2 = [0u8; 11]; + input2 + .read_exact(&mut buf2) + .expect("should read from second input"); + assert_eq!(&buf2, b"independent"); + } + + #[test] + fn test_sort_inputs_empty() { + let inputs = SortInputs::from_files(&[]); + assert_eq!(inputs.len(), 0); + assert!(inputs.is_empty()); + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_inputs_single_file() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"data") + .expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let files = vec![tmpfile.path().as_os_str().to_os_string()]; + let inputs = SortInputs::from_files(&files); + assert_eq!(inputs.len(), 1); + assert_eq!(inputs.unique_count(), 1); + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_inputs_multiple_unique() { + let mut tmpfile1 = NamedTempFile::new().expect("should create temp file"); + tmpfile1 + .write_all(b"data1") + .expect("should write to temp file"); + let mut tmpfile2 = NamedTempFile::new().expect("should create temp file"); + tmpfile2 + .write_all(b"data2") + .expect("should write to temp file"); + let mut tmpfile3 = NamedTempFile::new().expect("should create temp file"); + tmpfile3 + .write_all(b"data3") + .expect("should write to temp file"); + + let files = vec![ + tmpfile1.path().as_os_str().to_os_string(), + tmpfile2.path().as_os_str().to_os_string(), + tmpfile3.path().as_os_str().to_os_string(), + ]; + let inputs = SortInputs::from_files(&files); + assert_eq!(inputs.len(), 3); + assert_eq!(inputs.unique_count(), 3); + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_inputs_with_duplicates() { + let mut tmpfile1 = NamedTempFile::new().expect("should create temp file"); + tmpfile1 + .write_all(b"data1") + .expect("should write to temp file"); + let mut tmpfile2 = NamedTempFile::new().expect("should create temp file"); + tmpfile2 + .write_all(b"data2") + .expect("should write to temp file"); + + let files = vec![ + tmpfile1.path().as_os_str().to_os_string(), + tmpfile1.path().as_os_str().to_os_string(), + tmpfile2.path().as_os_str().to_os_string(), + ]; + let inputs = SortInputs::from_files(&files); + assert_eq!(inputs.len(), 3); + // 2 unique: file1 (mmap) and file2 (direct) + assert_eq!(inputs.unique_count(), 2); + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_inputs_duplicate_mmap_independent() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"independent reads") + .expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let files = vec![ + tmpfile.path().as_os_str().to_os_string(), + tmpfile.path().as_os_str().to_os_string(), + ]; + let inputs = SortInputs::from_files(&files); + + assert_eq!(inputs.len(), 2); + + for input in inputs.iter() { + assert!(matches!( + input.inner, + DeferredInput::Path { + access: InputAccess::SharedMemoryMap, + .. + } + )); + } + } + + #[test] + fn test_sort_inputs_stdin_only() { + let files = vec![OsString::from("-")]; + let inputs = SortInputs::from_files(&files); + let input = inputs.iter().next().expect("should get first input"); + assert_eq!(inputs.len(), 1); + assert!(matches!(input.inner, DeferredInput::Stdin)); + } + + #[test] + fn test_sort_inputs_duplicate_stdin_allowed() { + // Verify that duplicate stdin is allowed (GNU Coreutils compatible) + let files = vec![OsString::from("-"), OsString::from("-")]; + let inputs = SortInputs::from_files(&files); + assert_eq!(inputs.len(), files.len()); + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_inputs_mixed_stdin_and_files_allowed() { + // Verify that mixing stdin with files is allowed (GNU Coreutils compatible) + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"data") + .expect("should write to temp file"); + + let files = vec![ + OsString::from("-"), + tmpfile.path().as_os_str().to_os_string(), + ]; + let inputs = SortInputs::from_files(&files); + assert_eq!(inputs.len(), files.len()); + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_inputs_order_preserved() { + let mut tmpfile1 = NamedTempFile::new().expect("should create temp file"); + tmpfile1 + .write_all(b"data1") + .expect("should write to temp file"); + let mut tmpfile2 = NamedTempFile::new().expect("should create temp file"); + tmpfile2 + .write_all(b"data2") + .expect("should write to temp file"); + + let files = vec![ + tmpfile2.path().as_os_str().to_os_string(), + tmpfile1.path().as_os_str().to_os_string(), + ]; + let inputs = SortInputs::from_files(&files); + let collected: Vec<_> = inputs.iter().collect(); + assert_eq!(collected.len(), 2); + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_inputs_from_files_error() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"data") + .expect("should write to temp file"); + + let files = vec![ + tmpfile.path().as_os_str().to_os_string(), + OsString::from("/nonexistent/path/file.txt"), + ]; + let inputs = SortInputs::from_files(&files); + let mut iter = inputs.into_iter(); + assert!(iter.next().expect("should get first input").is_ok()); // first file opens successfully + assert!(iter.next().expect("should get second input").is_err()); // second file fails to open + } +} diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index e9dc9bfa6f6..90fa6a3620c 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -1169,6 +1169,70 @@ fn test_merge_reversed() { .stdout_only_fixture("merge_ints_reversed.expected"); } +#[test] +fn test_merge_duplicate_files() { + let (at, mut ucmd) = at_and_ucmd!(); + at.write("merge_duplicates_1.txt", "1\n3\n5\n"); + // Test that merging the same file twice produces correct output + // This verifies FD reuse via SortInputs + ucmd.arg("-m") + .arg("merge_duplicates_1.txt") + .arg("merge_duplicates_1.txt") + .succeeds() + .stdout_is("1\n1\n3\n3\n5\n5\n"); +} + +#[test] +fn test_merge_duplicate_files_interleaved() { + let (at, mut ucmd) = at_and_ucmd!(); + at.write("merge_duplicates_1.txt", "1\n3\n5\n"); + at.write("merge_duplicates_2.txt", "2\n4\n6\n"); + // Test merging file1, file2, file1 - same file appears twice + ucmd.arg("-m") + .arg("merge_duplicates_1.txt") + .arg("merge_duplicates_2.txt") + .arg("merge_duplicates_1.txt") + .succeeds() + .stdout_is("1\n1\n2\n3\n3\n4\n5\n5\n6\n"); +} + +#[test] +fn test_merge_single_stdin() { + // Test that single stdin works with -m + new_ucmd!() + .arg("-m") + .arg("-") + .pipe_in("1\n3\n5\n") + .succeeds() + .stdout_only("1\n3\n5\n"); +} +#[test] +fn test_merge_duplicate_stdin() { + // Verify that duplicate stdin is allowed (GNU Coreutils compatible) + // Note: stdin is a single stream, so duplicate '-' reads the same stream. + // The first read gets the data, subsequent reads get EOF (empty). + new_ucmd!() + .arg("-m") + .arg("-") + .arg("-") + .pipe_in("1\n3\n5\n") + .succeeds() + .stdout_only("1\n3\n5\n"); +} + +#[test] +fn test_merge_mixed_stdin_and_files() { + let (at, mut ucmd) = at_and_ucmd!(); + at.write("merge_duplicates_1.txt", "1\n3\n5\n"); + // Verify that sort -m allows mixing stdin with files (GNU Coreutils compatible) + ucmd.arg("-m") + .arg("-") + .arg("merge_duplicates_1.txt") + .pipe_in("apricot\nelderberry\nkiwi\n") + .succeeds() + .stdout_is("1\n3\n5\napricot\nelderberry\nkiwi\n"); +} + #[test] fn test_pipe() { // TODO: issue 1608 reports a panic when we attempt to read from stdin,