From ae89ca3a54ecb74b269f88a95bae6519d1104cd2 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Thu, 23 Apr 2026 16:11:46 +0200 Subject: [PATCH 01/19] sort: deduplicate FDs and defer opening unique files during merge New tests are added for merging duplicate files. --- Cargo.lock | 1 + src/uu/sort/Cargo.toml | 1 + src/uu/sort/src/merge.rs | 141 +++-- src/uu/sort/src/sort.rs | 1 + src/uu/sort/src/sort_input.rs | 559 ++++++++++++++++++ tests/by-util/test_sort.rs | 60 ++ tests/fixtures/sort/merge_duplicates_1.txt | 3 + .../sort/merge_duplicates_1_1.expected | 6 + .../sort/merge_duplicates_1_1_2.expected | 8 + .../sort/merge_duplicates_1_2_1.expected | 9 + tests/fixtures/sort/merge_duplicates_2.txt | 3 + .../fixtures/sort/merge_mixed_stdin.expected | 6 + 12 files changed, 763 insertions(+), 35 deletions(-) create mode 100644 src/uu/sort/src/sort_input.rs create mode 100644 tests/fixtures/sort/merge_duplicates_1.txt create mode 100644 tests/fixtures/sort/merge_duplicates_1_1.expected create mode 100644 tests/fixtures/sort/merge_duplicates_1_1_2.expected create mode 100644 tests/fixtures/sort/merge_duplicates_1_2_1.expected create mode 100644 tests/fixtures/sort/merge_duplicates_2.txt create mode 100644 tests/fixtures/sort/merge_mixed_stdin.expected diff --git a/Cargo.lock b/Cargo.lock index dd9f31c6541..dec1df3017e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4121,6 +4121,7 @@ dependencies = [ "itertools 0.14.0", "libc", "memchr", + "memmap2", "rand 0.10.1", "rayon", "rustix", diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index db51104c096..707daac201f 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -31,6 +31,7 @@ clap = { workspace = true } compare = { workspace = true } itertools = { workspace = true } memchr = { workspace = true } +memmap2 = { workspace = true } rand = { workspace = true } rayon = { workspace = true } self_cell = { workspace = true } diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index 85276fce6d8..8dc403951ad 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -13,55 +13,32 @@ use std::{ cmp::Ordering, - ffi::{OsStr, OsString}, + ffi::OsString, fs::{self, File}, io::{BufWriter, Read, Write}, iter, path::{Path, PathBuf}, process::{Child, ChildStdin, ChildStdout, Command, Stdio}, rc::Rc, - sync::mpsc::{Receiver, Sender, SyncSender, channel, sync_channel}, + sync::{ + Arc, + mpsc::{Receiver, Sender, SyncSender, channel, sync_channel}, + }, thread::{self, JoinHandle}, }; use compare::Compare; +use memmap2::Mmap; use uucore::error::{FromIo, UResult}; use crate::{ GlobalSettings, Output, SortError, chunks::{self, Chunk, RecycledChunk}, - compare_by, current_open_fd_count, fd_soft_limit, open, + compare_by, current_open_fd_count, fd_soft_limit, + sort_input::{SortInput, SortInputs}, tmp_dir::TmpDirWrapper, }; -/// If the output file occurs in the input files as well, copy the contents of the output file -/// and replace its occurrences in the inputs with that copy. -fn replace_output_file_in_input_files( - files: &mut [OsString], - output: Option<&OsStr>, - tmp_dir: &mut TmpDirWrapper, -) -> UResult<()> { - let mut copy: Option = None; - if let Some(Ok(output_path)) = output.map(|path| Path::new(path).canonicalize()) { - for file in files { - if let Ok(file_path) = Path::new(file).canonicalize() { - if file_path == output_path { - if let Some(copy) = © { - *file = copy.clone().into_os_string(); - } else { - let (_file, copy_path) = tmp_dir.next_file()?; - fs::copy(file_path, ©_path) - .map_err(|error| SortError::OpenTmpFileFailed { error })?; - *file = copy_path.clone().into_os_string(); - copy = Some(copy_path); - } - } - } - } - } - Ok(()) -} - /// Determine the effective merge batch size, enforcing a minimum and respecting the /// file-descriptor soft limit after reserving stdio/output and a safety margin. fn effective_merge_batch_size(settings: &GlobalSettings) -> usize { @@ -99,10 +76,45 @@ pub fn merge( output: Output, tmp_dir: &mut TmpDirWrapper, ) -> UResult<()> { - replace_output_file_in_input_files(files, output.as_output_name(), tmp_dir)?; - let files = files - .iter() - .map(|file| open(file).map(|file| PlainMergeInput { inner: file })); + // If the output file is also listed as an input, pre-mmap it before + // it gets opened for writing. This allows reading the original content + // via mmap while writing to the same file, without needing a temp copy. + let output_as_input = if let Some(name) = output.as_output_name() { + let output_path = Path::new(name).canonicalize()?; + let appears = files.iter().any(|f| { + Path::new(f) + .canonicalize() + .map(|p| p == output_path) + .unwrap_or(false) + }); + if appears { + let read_fd = File::open(name).map_err(|error| SortError::ReadFailed { + path: output_path.clone(), + error, + })?; + // SAFETY: We keep the read_fd open for the lifetime of the mmap, + // and we only read from it. The file is not modified while the + // mmap exists (writing happens later via a separate FD). + let mmap = Arc::new(unsafe { Mmap::map(&read_fd) }.map_err(|error| { + SortError::ReadFailed { + path: output_path.clone(), + error, + } + })?); + Some((output_path, mmap)) + } else { + None + } + } else { + None + }; + + let sort_inputs = SortInputs::from_files_with_output(files, output_as_input)?; + let files = sort_inputs.into_iter().map(|result| { + result.map(|input| PlainMergeInput:: { + inner: input, + }) + }); if settings.compress_prog.is_none() { merge_with_file_limit::<_, _, WriteablePlainTmpFile>(files, settings, output, tmp_dir) } else { @@ -596,3 +608,62 @@ impl MergeInput for PlainMergeInput { &mut self.inner } } + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + use tempfile::NamedTempFile; + + #[test] + fn test_merge_output_as_input() { + // Setup: output file 'out' contains "6\n", inputs in/1..in/5 contain 1..5 + let mut out = NamedTempFile::new().unwrap(); + out.write_all(b"6\n").unwrap(); + out.flush().unwrap(); + let out_path = out.path().as_os_str().to_os_string(); + + let inputs: Vec = (1..=5) + .map(|i| { + let mut f = NamedTempFile::new().unwrap(); + writeln!(f, "{}", i).unwrap(); + f.flush().unwrap(); + f + }) + .collect(); + + let files = vec![ + out_path.clone(), + inputs[0].path().as_os_str().to_os_string(), + inputs[1].path().as_os_str().to_os_string(), + inputs[2].path().as_os_str().to_os_string(), + inputs[3].path().as_os_str().to_os_string(), + inputs[4].path().as_os_str().to_os_string(), + out_path.clone(), + ]; + + // Check Opened SortInputs: 7 inputs but only 6 unique sources + let output_canon = out.path().canonicalize().unwrap(); + let read_fd = File::open(out.path()).unwrap(); + let output_mmap = Arc::new(unsafe { Mmap::map(&read_fd).unwrap() }); + + let sort_inputs = + SortInputs::from_files_with_output(&files, Some((output_canon, output_mmap))).unwrap(); + + assert_eq!(sort_inputs.len(), 7); + assert_eq!(sort_inputs.unique_count(), 6); + + // Run merge + let mut files_mut = files.clone(); + let settings = GlobalSettings::default(); + let output = Output::new(Some(out_path.as_os_str())).unwrap(); + let mut tmp_dir = TmpDirWrapper::new(std::env::temp_dir()); + + merge(&mut files_mut, &settings, output, &mut tmp_dir).unwrap(); + + // If merge succeeded with only 6 unique sources, mmap deduplication worked. + // Verify correctness. + let result = std::fs::read_to_string(out.path()).unwrap(); + assert_eq!(result, "1\n2\n3\n4\n5\n6\n6\n"); + } +} diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index d27f6eadb1a..81f26e89081 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -17,6 +17,7 @@ mod custom_str_cmp; mod ext_sort; mod merge; mod numeric_str_cmp; +mod sort_input; mod tmp_dir; use bigdecimal::BigDecimal; diff --git a/src/uu/sort/src/sort_input.rs b/src/uu/sort/src/sort_input.rs new file mode 100644 index 00000000000..dd9184f4962 --- /dev/null +++ b/src/uu/sort/src/sort_input.rs @@ -0,0 +1,559 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. +//! Input file handling for sort merge. +//! +//! Dedupes duplicate paths via mmap and defers opening unique files until +//! iteration so `merge_with_file_limit` respects its batch size. + +use std::{ + collections::HashMap, + ffi::OsStr, + fs::File, + io::{self, Read}, + path::{Path, PathBuf}, + sync::{ + Arc, + atomic::{AtomicUsize, Ordering}, + }, +}; + +use memmap2::Mmap; +use uucore::error::UResult; + +use crate::{STDIN_FILE, SortError}; + +/// An inner enum representing the actual source of a sort input. +#[derive(Debug)] +enum SortInputInner { + /// An already-opened regular file. + File(File), + /// A memory-mapped file shared across duplicate paths, with an + /// independent cursor per instance. + Mmap { + data: Arc, + offset: AtomicUsize, + }, + Stdin, + /// A unique file whose open() is deferred until iteration. + LazyFile(PathBuf), +} + +/// Handle to a single sort input (file, mmap, or stdin). +#[derive(Debug)] +pub struct SortInput { + inner: SortInputInner, +} + +impl SortInput { + /// Open a path directly (stdin if `"-"`). + pub fn new(path: &OsStr) -> UResult { + if path == STDIN_FILE { + return Ok(Self { + inner: SortInputInner::Stdin, + }); + } + + let path = Path::new(path); + match File::open(path) { + Ok(f) => Ok(Self { + inner: SortInputInner::File(f), + }), + Err(error) => Err(SortError::ReadFailed { + path: path.to_owned(), + error, + } + .into()), + } + } + + fn from_mmap(data: Arc) -> Self { + Self { + inner: SortInputInner::Mmap { + data, + offset: AtomicUsize::new(0), + }, + } + } + + /// Returns true if this input is stdin. + pub fn is_stdin(&self) -> bool { + matches!(self.inner, SortInputInner::Stdin) + } +} + +impl Read for SortInput { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + // Open LazyFile on first read (fallback if iterator didn't do it). + if let SortInputInner::LazyFile(path) = &self.inner { + let file = File::open(path)?; + self.inner = SortInputInner::File(file); + } + + match &mut self.inner { + SortInputInner::File(file) => file.read(buf), + SortInputInner::Mmap { data, offset } => { + let pos = offset.load(Ordering::Relaxed); + let available = data.len().saturating_sub(pos); + let to_read = buf.len().min(available); + if to_read > 0 { + buf[..to_read].copy_from_slice(&data[pos..pos + to_read]); + offset.fetch_add(to_read, Ordering::Relaxed); + } + Ok(to_read) + } + SortInputInner::Stdin => { + let mut stdin = io::stdin(); + stdin.read(buf) + } + SortInputInner::LazyFile(_) => unreachable!(), + } + } +} + +impl From for Box { + fn from(input: SortInput) -> Self { + Box::new(input) as Box + } +} + +/// Collection of sort inputs. +/// +/// Preserves argument order and multiplicity. Duplicate paths are mmap'd once +/// and shared; unique paths are stored lazily and opened during iteration so the +/// merge batch size limits active FDs. +#[derive(Debug)] +pub struct SortInputs { + inputs: Vec, +} + +impl SortInputs { + #[allow(dead_code)] + // Used only in unit tests. + pub fn from_files(files: &[std::ffi::OsString]) -> UResult { + Self::from_files_with_output(files, None) + } + + /// Build a `SortInputs` from paths. + /// + /// - Duplicate paths → one mmap shared across all instances. + /// - Unique paths → stored as `LazyFile` and opened when the iterator yields them. + /// - `output_as_input`: pre-created mmap used when the output file is also an input. + pub fn from_files_with_output( + files: &[std::ffi::OsString], + output_as_input: Option<(PathBuf, Arc)>, + ) -> UResult { + let mut inputs = Vec::with_capacity(files.len()); + + // First pass: count occurrences of each path to identify duplicates + let mut path_counts: HashMap = HashMap::new(); + for file in files { + if file != STDIN_FILE { + let path = Path::new(file); + let canonical = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); + *path_counts.entry(canonical).or_insert(0) += 1; + } + } + + // Second pass: build inputs + // - Unique files: LazyFile (opened during iteration) + // - Duplicate files: mmap once, share it + // - Output-as-input: use pre-created mmap + // - Stdin: single occurrence + let mut opened_files: HashMap> = HashMap::new(); + + for file in files { + if file == STDIN_FILE { + inputs.push(SortInput { + inner: SortInputInner::Stdin, + }); + } else { + let path = Path::new(file); + let canonical = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); + + // Check if this is the output file used as input + if let Some((ref output_path, ref output_mmap)) = output_as_input { + if canonical == *output_path { + inputs.push(SortInput::from_mmap(output_mmap.clone())); + continue; + } + } + + if *path_counts.get(&canonical).unwrap_or(&0) > 1 { + // Duplicate file: use mmap + let mmap = if let Some(mmap) = opened_files.get(&canonical) { + mmap.clone() + } else { + let f = File::open(path).map_err(|error| SortError::ReadFailed { + path: path.to_owned(), + error, + })?; + // SAFETY: We keep the file open for the lifetime of the mmap, + // and we only read from it. The file is not modified. + let mmap = Arc::new(unsafe { Mmap::map(&f) }.map_err(|error| { + SortError::ReadFailed { + path: path.to_owned(), + error, + } + })?); + opened_files.insert(canonical, mmap.clone()); + mmap + }; + inputs.push(SortInput::from_mmap(mmap)); + } else { + // Unique file: defer opening until iteration so that + // merge_with_file_limit can respect its batch_size and + // avoid exceeding the file-descriptor soft limit. + inputs.push(SortInput { + inner: SortInputInner::LazyFile(path.to_path_buf()), + }); + } + } + } + + Ok(Self { inputs }) + } + + /// Returns the total number of inputs (including duplicates). + #[allow(dead_code)] + pub fn len(&self) -> usize { + self.inputs.len() + } + + /// Returns true if there are no inputs. + #[allow(dead_code)] + pub fn is_empty(&self) -> bool { + self.inputs.is_empty() + } + + /// Returns the number of unique sources (stdin + unique files + mmap groups). + #[allow(dead_code)] + pub fn unique_count(&self) -> usize { + let mut file_count = 0; + let mut stdin_present = false; + + // Count unique mmap instances by Arc pointer + let mut seen_mmaps = std::collections::HashSet::new(); + + for input in &self.inputs { + match &input.inner { + SortInputInner::Stdin => { + stdin_present = true; + } + SortInputInner::File(_) | SortInputInner::LazyFile(_) => { + file_count += 1; + } + SortInputInner::Mmap { data, .. } => { + seen_mmaps.insert(Arc::as_ptr(data)); + } + } + } + + file_count + seen_mmaps.len() + usize::from(stdin_present) + } + + /// Iterate over the inputs without consuming them. + #[allow(dead_code)] + pub fn iter(&self) -> impl Iterator { + self.inputs.iter() + } +} + +/// Iterator that opens LazyFile entries as they are yielded. +#[derive(Debug)] +pub struct SortInputsIntoIter { + inner: std::vec::IntoIter, +} + +impl Iterator for SortInputsIntoIter { + type Item = UResult; + + fn next(&mut self) -> Option { + let mut input = self.inner.next()?; + + // Convert LazyFile to File so errors surface during iteration. + if let SortInputInner::LazyFile(path) = &input.inner { + match File::open(path) { + Ok(file) => input.inner = SortInputInner::File(file), + Err(error) => { + return Some(Err(SortError::ReadFailed { path: path.clone(), error }.into())); + } + } + } + Some(Ok(input)) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +impl ExactSizeIterator for SortInputsIntoIter { + fn len(&self) -> usize { + self.inner.len() + } +} + +impl IntoIterator for SortInputs { + type Item = UResult; + type IntoIter = SortInputsIntoIter; + + fn into_iter(self) -> Self::IntoIter { + SortInputsIntoIter { + inner: self.inputs.into_iter(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::ffi::OsString; + use std::io::Write; + use tempfile::NamedTempFile; + + #[test] + fn test_sort_input_new_file() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile.write_all(b"hello world").expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let input = SortInput::new(tmpfile.path().as_os_str()).expect("should create sort input"); + assert!(!input.is_stdin()); + } + + #[test] + fn test_sort_input_new_stdin() { + let input = SortInput::new(OsStr::new("-")).expect("should create sort input for stdin"); + assert!(input.is_stdin()); + } + + #[test] + fn test_sort_input_new_missing_file() { + let result = SortInput::new(OsStr::new("/nonexistent/path/file.txt")); + assert!(result.is_err()); + } + + #[test] + fn test_sort_input_mmap_read() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile.write_all(b"mmap test data").expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let file = File::open(tmpfile.path()).expect("should open temp file"); + let mmap = Arc::new(unsafe { Mmap::map(&file).expect("should mmap temp file") }); + let mut input = SortInput::from_mmap(mmap); + + let mut buf = [0u8; 14]; + let n = input.read(&mut buf).expect("should read from input"); + assert_eq!(n, 14); + assert_eq!(&buf, b"mmap test data"); + } + + #[test] + fn test_sort_input_into_box_read() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile.write_all(b"test data").expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let file = File::open(tmpfile.path()).expect("should open temp file"); + let mmap = Arc::new(unsafe { Mmap::map(&file).expect("should mmap temp file") }); + let input = SortInput::from_mmap(mmap); + let mut reader: Box = input.into(); + let mut buf = [0u8; 9]; + let n = reader.read(&mut buf).expect("should read from boxed reader"); + assert_eq!(n, 9); + assert_eq!(&buf, b"test data"); + } + + #[test] + fn test_sort_input_mmap_independent_reads() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile.write_all(b"independent reads").expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let file = File::open(tmpfile.path()).expect("should open temp file"); + let mmap = Arc::new(unsafe { Mmap::map(&file).expect("should mmap temp file") }); + + let mut input1 = SortInput::from_mmap(mmap.clone()); + let mut input2 = SortInput::from_mmap(mmap); + + // Both should be able to read independently + let mut buf1 = [0u8; 11]; + input1.read(&mut buf1).expect("should read from first input"); + assert_eq!(&buf1, b"independent"); + + let mut buf2 = [0u8; 11]; + input2.read(&mut buf2).expect("should read from second input"); + assert_eq!(&buf2, b"independent"); + } + + #[test] + fn test_sort_inputs_empty() { + let inputs = SortInputs::from_files(&[]).expect("should build empty sort inputs"); + assert_eq!(inputs.len(), 0); + assert!(inputs.is_empty()); + } + + #[test] + fn test_sort_inputs_single_file() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile.write_all(b"data").expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let files = vec![tmpfile.path().as_os_str().to_os_string()]; + let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + assert_eq!(inputs.len(), 1); + assert_eq!(inputs.unique_count(), 1); + } + + #[test] + fn test_sort_inputs_multiple_unique() { + let mut tmpfile1 = NamedTempFile::new().expect("should create temp file"); + tmpfile1.write_all(b"data1").expect("should write to temp file"); + let mut tmpfile2 = NamedTempFile::new().expect("should create temp file"); + tmpfile2.write_all(b"data2").expect("should write to temp file"); + let mut tmpfile3 = NamedTempFile::new().expect("should create temp file"); + tmpfile3.write_all(b"data3").expect("should write to temp file"); + + let files = vec![ + tmpfile1.path().as_os_str().to_os_string(), + tmpfile2.path().as_os_str().to_os_string(), + tmpfile3.path().as_os_str().to_os_string(), + ]; + let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + assert_eq!(inputs.len(), 3); + assert_eq!(inputs.unique_count(), 3); + } + + #[test] + fn test_sort_inputs_with_duplicates() { + let mut tmpfile1 = NamedTempFile::new().expect("should create temp file"); + tmpfile1.write_all(b"data1").expect("should write to temp file"); + let mut tmpfile2 = NamedTempFile::new().expect("should create temp file"); + tmpfile2.write_all(b"data2").expect("should write to temp file"); + + let files = vec![ + tmpfile1.path().as_os_str().to_os_string(), + tmpfile1.path().as_os_str().to_os_string(), + tmpfile2.path().as_os_str().to_os_string(), + ]; + let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + assert_eq!(inputs.len(), 3); + // 2 unique: file1 (mmap) and file2 (direct) + assert_eq!(inputs.unique_count(), 2); + } + + #[test] + fn test_sort_inputs_duplicate_mmap_independent() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile.write_all(b"independent reads").expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let files = vec![ + tmpfile.path().as_os_str().to_os_string(), + tmpfile.path().as_os_str().to_os_string(), + ]; + let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + + // Both inputs should be able to read independently + let mut buf1 = [0u8; 11]; + let mut input1 = SortInput { + inner: match &inputs.iter().nth(0).expect("should get first input").inner { + SortInputInner::Mmap { data, offset } => SortInputInner::Mmap { + data: data.clone(), + offset: AtomicUsize::new(offset.load(Ordering::Relaxed)), + }, + _ => panic!("Expected mmap"), + }, + }; + input1.read(&mut buf1).expect("should read from first input"); + assert_eq!(&buf1, b"independent"); + + let mut buf2 = [0u8; 11]; + let mut input2 = SortInput { + inner: match &inputs.iter().nth(1).expect("should get second input").inner { + SortInputInner::Mmap { data, offset } => SortInputInner::Mmap { + data: data.clone(), + offset: AtomicUsize::new(offset.load(Ordering::Relaxed)), + }, + _ => panic!("Expected mmap"), + }, + }; + input2.read(&mut buf2).expect("should read from second input"); + assert_eq!(&buf2, b"independent"); + } + + #[test] + fn test_sort_inputs_stdin_only() { + let files = vec![OsString::from("-")]; + let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + assert_eq!(inputs.len(), 1); + assert!(inputs.iter().next().expect("should get first input").is_stdin()); + } + + #[test] + fn test_sort_inputs_duplicate_stdin_allowed() { + // Verify that duplicate stdin is allowed (GNU Coreutils compatible) + let files = vec![OsString::from("-"), OsString::from("-")]; + let result = SortInputs::from_files(&files); + assert!(result.is_ok()); + } + + #[test] + fn test_sort_inputs_mixed_stdin_and_files_allowed() { + // Verify that mixing stdin with files is allowed (GNU Coreutils compatible) + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile.write_all(b"data").expect("should write to temp file"); + + let files = vec![ + OsString::from("-"), + tmpfile.path().as_os_str().to_os_string(), + ]; + let result = SortInputs::from_files(&files); + assert!(result.is_ok()); + } + + #[test] + fn test_sort_inputs_order_preserved() { + let mut tmpfile1 = NamedTempFile::new().expect("should create temp file"); + tmpfile1.write_all(b"data1").expect("should write to temp file"); + let mut tmpfile2 = NamedTempFile::new().expect("should create temp file"); + tmpfile2.write_all(b"data2").expect("should write to temp file"); + + let files = vec![ + tmpfile2.path().as_os_str().to_os_string(), + tmpfile1.path().as_os_str().to_os_string(), + ]; + let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + let collected: Vec<_> = inputs.iter().collect(); + assert_eq!(collected.len(), 2); + } + + #[test] + fn test_sort_inputs_from_files_error() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile.write_all(b"data").expect("should write to temp file"); + + let files = vec![ + tmpfile.path().as_os_str().to_os_string(), + OsString::from("/nonexistent/path/file.txt"), + ]; + let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + let mut iter = inputs.into_iter(); + assert!(iter.next().expect("should get first input").is_ok()); // first file opens successfully + assert!(iter.next().expect("should get second input").is_err()); // second file fails to open + } + + #[test] + fn test_sort_inputs_into_iter() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile.write_all(b"data").expect("should write to temp file"); + + let files = vec![tmpfile.path().as_os_str().to_os_string()]; + let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + let count = inputs.into_iter().count(); + assert_eq!(count, 1); + } +} diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index e9dc9bfa6f6..2b14ef0b6a7 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -1169,6 +1169,66 @@ fn test_merge_reversed() { .stdout_only_fixture("merge_ints_reversed.expected"); } +#[test] +fn test_merge_duplicate_files() { + // Test that merging the same file twice produces correct output + // This verifies FD reuse via SortInputs + new_ucmd!() + .arg("-m") + .arg("merge_duplicates_1.txt") + .arg("merge_duplicates_1.txt") + .succeeds() + .stdout_only_fixture("merge_duplicates_1_1.expected"); +} + +#[test] +fn test_merge_duplicate_files_interleaved() { + // Test merging file1, file2, file1 - same file appears twice + new_ucmd!() + .arg("-m") + .arg("merge_duplicates_1.txt") + .arg("merge_duplicates_2.txt") + .arg("merge_duplicates_1.txt") + .succeeds() + .stdout_only_fixture("merge_duplicates_1_2_1.expected"); +} + +#[test] +fn test_merge_single_stdin() { + // Test that single stdin works with -m + new_ucmd!() + .arg("-m") + .arg("-") + .pipe_in("1\n3\n5\n") + .succeeds() + .stdout_only("1\n3\n5\n"); +} +#[test] +fn test_merge_duplicate_stdin() { + // Verify that duplicate stdin is allowed (GNU Coreutils compatible) + // Note: stdin is a single stream, so duplicate '-' reads the same stream. + // The first read gets the data, subsequent reads get EOF (empty). + new_ucmd!() + .arg("-m") + .arg("-") + .arg("-") + .pipe_in("1\n3\n5\n") + .succeeds() + .stdout_only("1\n3\n5\n"); +} + +#[test] +fn test_merge_mixed_stdin_and_files() { + // Verify that sort -m allows mixing stdin with files (GNU Coreutils compatible) + new_ucmd!() + .arg("-m") + .arg("-") + .arg("merge_duplicates_1.txt") + .pipe_in("apricot\nelderberry\nkiwi\n") + .succeeds() + .stdout_only_fixture("merge_mixed_stdin.expected"); +} + #[test] fn test_pipe() { // TODO: issue 1608 reports a panic when we attempt to read from stdin, diff --git a/tests/fixtures/sort/merge_duplicates_1.txt b/tests/fixtures/sort/merge_duplicates_1.txt new file mode 100644 index 00000000000..403ec31f0a1 --- /dev/null +++ b/tests/fixtures/sort/merge_duplicates_1.txt @@ -0,0 +1,3 @@ +1 +3 +5 diff --git a/tests/fixtures/sort/merge_duplicates_1_1.expected b/tests/fixtures/sort/merge_duplicates_1_1.expected new file mode 100644 index 00000000000..03e6674d634 --- /dev/null +++ b/tests/fixtures/sort/merge_duplicates_1_1.expected @@ -0,0 +1,6 @@ +1 +1 +3 +3 +5 +5 diff --git a/tests/fixtures/sort/merge_duplicates_1_1_2.expected b/tests/fixtures/sort/merge_duplicates_1_1_2.expected new file mode 100644 index 00000000000..f51c0cf2211 --- /dev/null +++ b/tests/fixtures/sort/merge_duplicates_1_1_2.expected @@ -0,0 +1,8 @@ +1 +1 +2 +3 +4 +5 +5 +6 diff --git a/tests/fixtures/sort/merge_duplicates_1_2_1.expected b/tests/fixtures/sort/merge_duplicates_1_2_1.expected new file mode 100644 index 00000000000..2af073d821b --- /dev/null +++ b/tests/fixtures/sort/merge_duplicates_1_2_1.expected @@ -0,0 +1,9 @@ +1 +1 +2 +3 +3 +4 +5 +5 +6 diff --git a/tests/fixtures/sort/merge_duplicates_2.txt b/tests/fixtures/sort/merge_duplicates_2.txt new file mode 100644 index 00000000000..e2ba1efb18b --- /dev/null +++ b/tests/fixtures/sort/merge_duplicates_2.txt @@ -0,0 +1,3 @@ +2 +4 +6 diff --git a/tests/fixtures/sort/merge_mixed_stdin.expected b/tests/fixtures/sort/merge_mixed_stdin.expected new file mode 100644 index 00000000000..a46fadad31f --- /dev/null +++ b/tests/fixtures/sort/merge_mixed_stdin.expected @@ -0,0 +1,6 @@ +1 +3 +5 +apricot +elderberry +kiwi From 54ec44c92214c5dde30c2d3ca38f40ee5c005232 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Thu, 23 Apr 2026 18:45:03 +0200 Subject: [PATCH 02/19] chore: Update fuz cargo.lock --- fuzz/Cargo.lock | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 8b2870c66f3..66cc4dc9069 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -1275,6 +1275,15 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "memmap2" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" +dependencies = [ + "libc", +] + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -2050,6 +2059,11 @@ dependencies = [ "itertools", "libc", "memchr", +<<<<<<< HEAD +======= + "memmap2", + "nix", +>>>>>>> 7d0a0137f (chore: Update fuz cargo.lock) "rand", "rayon", "rustix", From cb5ddd4a74da35abc1b1eb61d27ebcfe4874c17b Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:49:41 +0200 Subject: [PATCH 03/19] chore: Run cargo fmt --- src/uu/sort/src/merge.rs | 8 ++- src/uu/sort/src/sort_input.rs | 98 +++++++++++++++++++++++++++-------- 2 files changed, 78 insertions(+), 28 deletions(-) diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index 8dc403951ad..a673701a88b 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -110,11 +110,9 @@ pub fn merge( }; let sort_inputs = SortInputs::from_files_with_output(files, output_as_input)?; - let files = sort_inputs.into_iter().map(|result| { - result.map(|input| PlainMergeInput:: { - inner: input, - }) - }); + let files = sort_inputs + .into_iter() + .map(|result| result.map(|input| PlainMergeInput:: { inner: input })); if settings.compress_prog.is_none() { merge_with_file_limit::<_, _, WriteablePlainTmpFile>(files, settings, output, tmp_dir) } else { diff --git a/src/uu/sort/src/sort_input.rs b/src/uu/sort/src/sort_input.rs index dd9184f4962..044740ac921 100644 --- a/src/uu/sort/src/sort_input.rs +++ b/src/uu/sort/src/sort_input.rs @@ -277,7 +277,11 @@ impl Iterator for SortInputsIntoIter { match File::open(path) { Ok(file) => input.inner = SortInputInner::File(file), Err(error) => { - return Some(Err(SortError::ReadFailed { path: path.clone(), error }.into())); + return Some(Err(SortError::ReadFailed { + path: path.clone(), + error, + } + .into())); } } } @@ -316,7 +320,9 @@ mod tests { #[test] fn test_sort_input_new_file() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); - tmpfile.write_all(b"hello world").expect("should write to temp file"); + tmpfile + .write_all(b"hello world") + .expect("should write to temp file"); tmpfile.flush().expect("should flush temp file"); let input = SortInput::new(tmpfile.path().as_os_str()).expect("should create sort input"); @@ -338,7 +344,9 @@ mod tests { #[test] fn test_sort_input_mmap_read() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); - tmpfile.write_all(b"mmap test data").expect("should write to temp file"); + tmpfile + .write_all(b"mmap test data") + .expect("should write to temp file"); tmpfile.flush().expect("should flush temp file"); let file = File::open(tmpfile.path()).expect("should open temp file"); @@ -354,7 +362,9 @@ mod tests { #[test] fn test_sort_input_into_box_read() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); - tmpfile.write_all(b"test data").expect("should write to temp file"); + tmpfile + .write_all(b"test data") + .expect("should write to temp file"); tmpfile.flush().expect("should flush temp file"); let file = File::open(tmpfile.path()).expect("should open temp file"); @@ -362,7 +372,9 @@ mod tests { let input = SortInput::from_mmap(mmap); let mut reader: Box = input.into(); let mut buf = [0u8; 9]; - let n = reader.read(&mut buf).expect("should read from boxed reader"); + let n = reader + .read(&mut buf) + .expect("should read from boxed reader"); assert_eq!(n, 9); assert_eq!(&buf, b"test data"); } @@ -370,7 +382,9 @@ mod tests { #[test] fn test_sort_input_mmap_independent_reads() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); - tmpfile.write_all(b"independent reads").expect("should write to temp file"); + tmpfile + .write_all(b"independent reads") + .expect("should write to temp file"); tmpfile.flush().expect("should flush temp file"); let file = File::open(tmpfile.path()).expect("should open temp file"); @@ -381,11 +395,15 @@ mod tests { // Both should be able to read independently let mut buf1 = [0u8; 11]; - input1.read(&mut buf1).expect("should read from first input"); + input1 + .read(&mut buf1) + .expect("should read from first input"); assert_eq!(&buf1, b"independent"); let mut buf2 = [0u8; 11]; - input2.read(&mut buf2).expect("should read from second input"); + input2 + .read(&mut buf2) + .expect("should read from second input"); assert_eq!(&buf2, b"independent"); } @@ -399,7 +417,9 @@ mod tests { #[test] fn test_sort_inputs_single_file() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); - tmpfile.write_all(b"data").expect("should write to temp file"); + tmpfile + .write_all(b"data") + .expect("should write to temp file"); tmpfile.flush().expect("should flush temp file"); let files = vec![tmpfile.path().as_os_str().to_os_string()]; @@ -411,11 +431,17 @@ mod tests { #[test] fn test_sort_inputs_multiple_unique() { let mut tmpfile1 = NamedTempFile::new().expect("should create temp file"); - tmpfile1.write_all(b"data1").expect("should write to temp file"); + tmpfile1 + .write_all(b"data1") + .expect("should write to temp file"); let mut tmpfile2 = NamedTempFile::new().expect("should create temp file"); - tmpfile2.write_all(b"data2").expect("should write to temp file"); + tmpfile2 + .write_all(b"data2") + .expect("should write to temp file"); let mut tmpfile3 = NamedTempFile::new().expect("should create temp file"); - tmpfile3.write_all(b"data3").expect("should write to temp file"); + tmpfile3 + .write_all(b"data3") + .expect("should write to temp file"); let files = vec![ tmpfile1.path().as_os_str().to_os_string(), @@ -430,9 +456,13 @@ mod tests { #[test] fn test_sort_inputs_with_duplicates() { let mut tmpfile1 = NamedTempFile::new().expect("should create temp file"); - tmpfile1.write_all(b"data1").expect("should write to temp file"); + tmpfile1 + .write_all(b"data1") + .expect("should write to temp file"); let mut tmpfile2 = NamedTempFile::new().expect("should create temp file"); - tmpfile2.write_all(b"data2").expect("should write to temp file"); + tmpfile2 + .write_all(b"data2") + .expect("should write to temp file"); let files = vec![ tmpfile1.path().as_os_str().to_os_string(), @@ -448,7 +478,9 @@ mod tests { #[test] fn test_sort_inputs_duplicate_mmap_independent() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); - tmpfile.write_all(b"independent reads").expect("should write to temp file"); + tmpfile + .write_all(b"independent reads") + .expect("should write to temp file"); tmpfile.flush().expect("should flush temp file"); let files = vec![ @@ -468,7 +500,9 @@ mod tests { _ => panic!("Expected mmap"), }, }; - input1.read(&mut buf1).expect("should read from first input"); + input1 + .read(&mut buf1) + .expect("should read from first input"); assert_eq!(&buf1, b"independent"); let mut buf2 = [0u8; 11]; @@ -481,7 +515,9 @@ mod tests { _ => panic!("Expected mmap"), }, }; - input2.read(&mut buf2).expect("should read from second input"); + input2 + .read(&mut buf2) + .expect("should read from second input"); assert_eq!(&buf2, b"independent"); } @@ -490,7 +526,13 @@ mod tests { let files = vec![OsString::from("-")]; let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); assert_eq!(inputs.len(), 1); - assert!(inputs.iter().next().expect("should get first input").is_stdin()); + assert!( + inputs + .iter() + .next() + .expect("should get first input") + .is_stdin() + ); } #[test] @@ -505,7 +547,9 @@ mod tests { fn test_sort_inputs_mixed_stdin_and_files_allowed() { // Verify that mixing stdin with files is allowed (GNU Coreutils compatible) let mut tmpfile = NamedTempFile::new().expect("should create temp file"); - tmpfile.write_all(b"data").expect("should write to temp file"); + tmpfile + .write_all(b"data") + .expect("should write to temp file"); let files = vec![ OsString::from("-"), @@ -518,9 +562,13 @@ mod tests { #[test] fn test_sort_inputs_order_preserved() { let mut tmpfile1 = NamedTempFile::new().expect("should create temp file"); - tmpfile1.write_all(b"data1").expect("should write to temp file"); + tmpfile1 + .write_all(b"data1") + .expect("should write to temp file"); let mut tmpfile2 = NamedTempFile::new().expect("should create temp file"); - tmpfile2.write_all(b"data2").expect("should write to temp file"); + tmpfile2 + .write_all(b"data2") + .expect("should write to temp file"); let files = vec![ tmpfile2.path().as_os_str().to_os_string(), @@ -534,7 +582,9 @@ mod tests { #[test] fn test_sort_inputs_from_files_error() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); - tmpfile.write_all(b"data").expect("should write to temp file"); + tmpfile + .write_all(b"data") + .expect("should write to temp file"); let files = vec![ tmpfile.path().as_os_str().to_os_string(), @@ -549,7 +599,9 @@ mod tests { #[test] fn test_sort_inputs_into_iter() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); - tmpfile.write_all(b"data").expect("should write to temp file"); + tmpfile + .write_all(b"data") + .expect("should write to temp file"); let files = vec![tmpfile.path().as_os_str().to_os_string()]; let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); From 097a7e604b119a98115f0e54a5781aada936bbd4 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Fri, 24 Apr 2026 08:30:58 +0200 Subject: [PATCH 04/19] chore: Fix cargo clippy feedback --- src/uu/sort/src/merge.rs | 4 ++-- src/uu/sort/src/sort_input.rs | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index a673701a88b..194a8eedef9 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -624,7 +624,7 @@ mod tests { let inputs: Vec = (1..=5) .map(|i| { let mut f = NamedTempFile::new().unwrap(); - writeln!(f, "{}", i).unwrap(); + writeln!(f, "{i}").unwrap(); f.flush().unwrap(); f }) @@ -661,7 +661,7 @@ mod tests { // If merge succeeded with only 6 unique sources, mmap deduplication worked. // Verify correctness. - let result = std::fs::read_to_string(out.path()).unwrap(); + let result = fs::read_to_string(out.path()).unwrap(); assert_eq!(result, "1\n2\n3\n4\n5\n6\n6\n"); } } diff --git a/src/uu/sort/src/sort_input.rs b/src/uu/sort/src/sort_input.rs index 044740ac921..65fbb502237 100644 --- a/src/uu/sort/src/sort_input.rs +++ b/src/uu/sort/src/sort_input.rs @@ -396,13 +396,13 @@ mod tests { // Both should be able to read independently let mut buf1 = [0u8; 11]; input1 - .read(&mut buf1) + .read_exact(&mut buf1) .expect("should read from first input"); assert_eq!(&buf1, b"independent"); let mut buf2 = [0u8; 11]; input2 - .read(&mut buf2) + .read_exact(&mut buf2) .expect("should read from second input"); assert_eq!(&buf2, b"independent"); } @@ -492,7 +492,7 @@ mod tests { // Both inputs should be able to read independently let mut buf1 = [0u8; 11]; let mut input1 = SortInput { - inner: match &inputs.iter().nth(0).expect("should get first input").inner { + inner: match &inputs.iter().next().expect("should get first input").inner { SortInputInner::Mmap { data, offset } => SortInputInner::Mmap { data: data.clone(), offset: AtomicUsize::new(offset.load(Ordering::Relaxed)), @@ -501,7 +501,7 @@ mod tests { }, }; input1 - .read(&mut buf1) + .read_exact(&mut buf1) .expect("should read from first input"); assert_eq!(&buf1, b"independent"); @@ -516,7 +516,7 @@ mod tests { }, }; input2 - .read(&mut buf2) + .read_exact(&mut buf2) .expect("should read from second input"); assert_eq!(&buf2, b"independent"); } From 79b02a1f481cadc3c68acddb5c134d0679714110 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Fri, 24 Apr 2026 11:49:57 +0200 Subject: [PATCH 05/19] tests: Create test files on the fly --- tests/by-util/test_sort.rs | 23 +++++++++++-------- tests/fixtures/sort/merge_duplicates_1.txt | 3 --- .../sort/merge_duplicates_1_1.expected | 6 ----- .../sort/merge_duplicates_1_1_2.expected | 8 ------- .../sort/merge_duplicates_1_2_1.expected | 9 -------- tests/fixtures/sort/merge_duplicates_2.txt | 3 --- 6 files changed, 14 insertions(+), 38 deletions(-) delete mode 100644 tests/fixtures/sort/merge_duplicates_1.txt delete mode 100644 tests/fixtures/sort/merge_duplicates_1_1.expected delete mode 100644 tests/fixtures/sort/merge_duplicates_1_1_2.expected delete mode 100644 tests/fixtures/sort/merge_duplicates_1_2_1.expected delete mode 100644 tests/fixtures/sort/merge_duplicates_2.txt diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index 2b14ef0b6a7..13fce2976b8 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -1171,26 +1171,30 @@ fn test_merge_reversed() { #[test] fn test_merge_duplicate_files() { + let (at, mut ucmd) = at_and_ucmd!(); + at.write("merge_duplicates_1.txt", "1\n3\n5\n"); // Test that merging the same file twice produces correct output // This verifies FD reuse via SortInputs - new_ucmd!() - .arg("-m") + ucmd.arg("-m") .arg("merge_duplicates_1.txt") .arg("merge_duplicates_1.txt") .succeeds() - .stdout_only_fixture("merge_duplicates_1_1.expected"); + .stdout_is("1\n1\n3\n3\n5\n5\n"); } #[test] fn test_merge_duplicate_files_interleaved() { + let (at, mut ucmd) = at_and_ucmd!(); + at.write("merge_duplicates_1.txt", "1\n3\n5\n"); + at.write("merge_duplicates_2.txt", "2\n4\n6\n"); + println!("Sub sir: {}", at.as_string()); // Test merging file1, file2, file1 - same file appears twice - new_ucmd!() - .arg("-m") + ucmd.arg("-m") .arg("merge_duplicates_1.txt") .arg("merge_duplicates_2.txt") .arg("merge_duplicates_1.txt") .succeeds() - .stdout_only_fixture("merge_duplicates_1_2_1.expected"); + .stdout_is("1\n1\n2\n3\n3\n4\n5\n5\n6\n"); } #[test] @@ -1219,14 +1223,15 @@ fn test_merge_duplicate_stdin() { #[test] fn test_merge_mixed_stdin_and_files() { + let (at, mut ucmd) = at_and_ucmd!(); + at.write("merge_duplicates_1.txt", "1\n3\n5\n"); // Verify that sort -m allows mixing stdin with files (GNU Coreutils compatible) - new_ucmd!() - .arg("-m") + ucmd.arg("-m") .arg("-") .arg("merge_duplicates_1.txt") .pipe_in("apricot\nelderberry\nkiwi\n") .succeeds() - .stdout_only_fixture("merge_mixed_stdin.expected"); + .stdout_is("1\n3\n5\napricot\nelderberry\nkiwi\n"); } #[test] diff --git a/tests/fixtures/sort/merge_duplicates_1.txt b/tests/fixtures/sort/merge_duplicates_1.txt deleted file mode 100644 index 403ec31f0a1..00000000000 --- a/tests/fixtures/sort/merge_duplicates_1.txt +++ /dev/null @@ -1,3 +0,0 @@ -1 -3 -5 diff --git a/tests/fixtures/sort/merge_duplicates_1_1.expected b/tests/fixtures/sort/merge_duplicates_1_1.expected deleted file mode 100644 index 03e6674d634..00000000000 --- a/tests/fixtures/sort/merge_duplicates_1_1.expected +++ /dev/null @@ -1,6 +0,0 @@ -1 -1 -3 -3 -5 -5 diff --git a/tests/fixtures/sort/merge_duplicates_1_1_2.expected b/tests/fixtures/sort/merge_duplicates_1_1_2.expected deleted file mode 100644 index f51c0cf2211..00000000000 --- a/tests/fixtures/sort/merge_duplicates_1_1_2.expected +++ /dev/null @@ -1,8 +0,0 @@ -1 -1 -2 -3 -4 -5 -5 -6 diff --git a/tests/fixtures/sort/merge_duplicates_1_2_1.expected b/tests/fixtures/sort/merge_duplicates_1_2_1.expected deleted file mode 100644 index 2af073d821b..00000000000 --- a/tests/fixtures/sort/merge_duplicates_1_2_1.expected +++ /dev/null @@ -1,9 +0,0 @@ -1 -1 -2 -3 -3 -4 -5 -5 -6 diff --git a/tests/fixtures/sort/merge_duplicates_2.txt b/tests/fixtures/sort/merge_duplicates_2.txt deleted file mode 100644 index e2ba1efb18b..00000000000 --- a/tests/fixtures/sort/merge_duplicates_2.txt +++ /dev/null @@ -1,3 +0,0 @@ -2 -4 -6 From dd682c076d72a36b7160abe520cfec7b94fd15d0 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Fri, 24 Apr 2026 14:04:10 +0200 Subject: [PATCH 06/19] tech: Fix use of unwrap(false) --- src/uu/sort/src/merge.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index 194a8eedef9..4e85fe759f7 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -81,12 +81,9 @@ pub fn merge( // via mmap while writing to the same file, without needing a temp copy. let output_as_input = if let Some(name) = output.as_output_name() { let output_path = Path::new(name).canonicalize()?; - let appears = files.iter().any(|f| { - Path::new(f) - .canonicalize() - .map(|p| p == output_path) - .unwrap_or(false) - }); + let appears = files + .iter() + .any(|f| Path::new(f).canonicalize().is_ok_and(|p| p == output_path)); if appears { let read_fd = File::open(name).map_err(|error| SortError::ReadFailed { path: output_path.clone(), From 9c00dac72354a1df53bf82fcbe39cff8fb1a3737 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Fri, 24 Apr 2026 14:38:39 +0200 Subject: [PATCH 07/19] fix: csspell - Rename Mmap to FileRead, use memory-map in comment and alias Mmap as MemoryMap --- src/uu/sort/src/merge.rs | 32 ++++++------ src/uu/sort/src/sort_input.rs | 96 ++++++++++++++++++----------------- 2 files changed, 66 insertions(+), 62 deletions(-) diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index 4e85fe759f7..1c24f80c9c7 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -28,7 +28,7 @@ use std::{ }; use compare::Compare; -use memmap2::Mmap; +use memmap2::Mmap as MemoryMap; use uucore::error::{FromIo, UResult}; use crate::{ @@ -76,9 +76,9 @@ pub fn merge( output: Output, tmp_dir: &mut TmpDirWrapper, ) -> UResult<()> { - // If the output file is also listed as an input, pre-mmap it before + // If the output file is also listed as an input, Use memory-map to load it before // it gets opened for writing. This allows reading the original content - // via mmap while writing to the same file, without needing a temp copy. + // via memory-map while writing to the same file, without needing a temp copy. let output_as_input = if let Some(name) = output.as_output_name() { let output_path = Path::new(name).canonicalize()?; let appears = files @@ -89,16 +89,17 @@ pub fn merge( path: output_path.clone(), error, })?; - // SAFETY: We keep the read_fd open for the lifetime of the mmap, + // SAFETY: We keep the read_fd open for the lifetime of the memory-map, // and we only read from it. The file is not modified while the - // mmap exists (writing happens later via a separate FD). - let mmap = Arc::new(unsafe { Mmap::map(&read_fd) }.map_err(|error| { - SortError::ReadFailed { - path: output_path.clone(), - error, - } - })?); - Some((output_path, mmap)) + // memory-map exists (writing happens later via a separate FD). + let output_as_input = + Arc::new(unsafe { MemoryMap::map(&read_fd) }.map_err(|error| { + SortError::ReadFailed { + path: output_path.clone(), + error, + } + })?); + Some((output_path, output_as_input)) } else { None } @@ -640,10 +641,11 @@ mod tests { // Check Opened SortInputs: 7 inputs but only 6 unique sources let output_canon = out.path().canonicalize().unwrap(); let read_fd = File::open(out.path()).unwrap(); - let output_mmap = Arc::new(unsafe { Mmap::map(&read_fd).unwrap() }); + let output_as_input = Arc::new(unsafe { MemoryMap::map(&read_fd).unwrap() }); let sort_inputs = - SortInputs::from_files_with_output(&files, Some((output_canon, output_mmap))).unwrap(); + SortInputs::from_files_with_output(&files, Some((output_canon, output_as_input))) + .unwrap(); assert_eq!(sort_inputs.len(), 7); assert_eq!(sort_inputs.unique_count(), 6); @@ -656,7 +658,7 @@ mod tests { merge(&mut files_mut, &settings, output, &mut tmp_dir).unwrap(); - // If merge succeeded with only 6 unique sources, mmap deduplication worked. + // If merge succeeded with only 6 unique sources, memory-map deduplication worked. // Verify correctness. let result = fs::read_to_string(out.path()).unwrap(); assert_eq!(result, "1\n2\n3\n4\n5\n6\n6\n"); diff --git a/src/uu/sort/src/sort_input.rs b/src/uu/sort/src/sort_input.rs index 65fbb502237..601476f9030 100644 --- a/src/uu/sort/src/sort_input.rs +++ b/src/uu/sort/src/sort_input.rs @@ -4,7 +4,7 @@ // file that was distributed with this source code. //! Input file handling for sort merge. //! -//! Dedupes duplicate paths via mmap and defers opening unique files until +//! Dedupes duplicate paths via memory-map and defers opening unique files until //! iteration so `merge_with_file_limit` respects its batch size. use std::{ @@ -19,7 +19,7 @@ use std::{ }, }; -use memmap2::Mmap; +use memmap2::Mmap as MemoryMap; use uucore::error::UResult; use crate::{STDIN_FILE, SortError}; @@ -31,8 +31,8 @@ enum SortInputInner { File(File), /// A memory-mapped file shared across duplicate paths, with an /// independent cursor per instance. - Mmap { - data: Arc, + FileRead { + data: Arc, offset: AtomicUsize, }, Stdin, @@ -40,7 +40,7 @@ enum SortInputInner { LazyFile(PathBuf), } -/// Handle to a single sort input (file, mmap, or stdin). +/// Handle to a single sort input (file, memory-map, or stdin). #[derive(Debug)] pub struct SortInput { inner: SortInputInner, @@ -68,9 +68,9 @@ impl SortInput { } } - fn from_mmap(data: Arc) -> Self { + fn from_memory_map(data: Arc) -> Self { Self { - inner: SortInputInner::Mmap { + inner: SortInputInner::FileRead { data, offset: AtomicUsize::new(0), }, @@ -93,7 +93,7 @@ impl Read for SortInput { match &mut self.inner { SortInputInner::File(file) => file.read(buf), - SortInputInner::Mmap { data, offset } => { + SortInputInner::FileRead { data, offset } => { let pos = offset.load(Ordering::Relaxed); let available = data.len().saturating_sub(pos); let to_read = buf.len().min(available); @@ -120,7 +120,7 @@ impl From for Box { /// Collection of sort inputs. /// -/// Preserves argument order and multiplicity. Duplicate paths are mmap'd once +/// Preserves argument order and multiplicity. Duplicate paths are memory-map'd once /// and shared; unique paths are stored lazily and opened during iteration so the /// merge batch size limits active FDs. #[derive(Debug)] @@ -137,12 +137,12 @@ impl SortInputs { /// Build a `SortInputs` from paths. /// - /// - Duplicate paths → one mmap shared across all instances. + /// - Duplicate paths → one memory-map shared across all instances. /// - Unique paths → stored as `LazyFile` and opened when the iterator yields them. - /// - `output_as_input`: pre-created mmap used when the output file is also an input. + /// - `output_as_input`: pre-created memory-map used when the output file is also an input. pub fn from_files_with_output( files: &[std::ffi::OsString], - output_as_input: Option<(PathBuf, Arc)>, + output_as_input: Option<(PathBuf, Arc)>, ) -> UResult { let mut inputs = Vec::with_capacity(files.len()); @@ -158,10 +158,10 @@ impl SortInputs { // Second pass: build inputs // - Unique files: LazyFile (opened during iteration) - // - Duplicate files: mmap once, share it - // - Output-as-input: use pre-created mmap + // - Duplicate files: memory-map once, share it + // - Output-as-input: use pre-created memory-map // - Stdin: single occurrence - let mut opened_files: HashMap> = HashMap::new(); + let mut opened_files: HashMap> = HashMap::new(); for file in files { if file == STDIN_FILE { @@ -173,34 +173,35 @@ impl SortInputs { let canonical = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); // Check if this is the output file used as input - if let Some((ref output_path, ref output_mmap)) = output_as_input { + if let Some((ref output_path, ref output_memory_map)) = output_as_input { if canonical == *output_path { - inputs.push(SortInput::from_mmap(output_mmap.clone())); + inputs.push(SortInput::from_memory_map(output_memory_map.clone())); continue; } } if *path_counts.get(&canonical).unwrap_or(&0) > 1 { - // Duplicate file: use mmap - let mmap = if let Some(mmap) = opened_files.get(&canonical) { - mmap.clone() + // Duplicate file: use memory-map + let memory_map = if let Some(memory_map) = opened_files.get(&canonical) { + memory_map.clone() } else { let f = File::open(path).map_err(|error| SortError::ReadFailed { path: path.to_owned(), error, })?; - // SAFETY: We keep the file open for the lifetime of the mmap, + // SAFETY: We keep the file open for the lifetime of the memory-map, // and we only read from it. The file is not modified. - let mmap = Arc::new(unsafe { Mmap::map(&f) }.map_err(|error| { - SortError::ReadFailed { - path: path.to_owned(), - error, - } - })?); - opened_files.insert(canonical, mmap.clone()); - mmap + let memory_map = + Arc::new(unsafe { MemoryMap::map(&f) }.map_err(|error| { + SortError::ReadFailed { + path: path.to_owned(), + error, + } + })?); + opened_files.insert(canonical, memory_map.clone()); + memory_map }; - inputs.push(SortInput::from_mmap(mmap)); + inputs.push(SortInput::from_memory_map(memory_map)); } else { // Unique file: defer opening until iteration so that // merge_with_file_limit can respect its batch_size and @@ -227,14 +228,14 @@ impl SortInputs { self.inputs.is_empty() } - /// Returns the number of unique sources (stdin + unique files + mmap groups). + /// Returns the number of unique sources (stdin + unique files + memory-map groups). #[allow(dead_code)] pub fn unique_count(&self) -> usize { let mut file_count = 0; let mut stdin_present = false; - // Count unique mmap instances by Arc pointer - let mut seen_mmaps = std::collections::HashSet::new(); + // Count unique memory-map instances by Arc pointer + let mut seen_memory_maps = std::collections::HashSet::new(); for input in &self.inputs { match &input.inner { @@ -244,13 +245,13 @@ impl SortInputs { SortInputInner::File(_) | SortInputInner::LazyFile(_) => { file_count += 1; } - SortInputInner::Mmap { data, .. } => { - seen_mmaps.insert(Arc::as_ptr(data)); + SortInputInner::FileRead { data, .. } => { + seen_memory_maps.insert(Arc::as_ptr(data)); } } } - file_count + seen_mmaps.len() + usize::from(stdin_present) + file_count + seen_memory_maps.len() + usize::from(stdin_present) } /// Iterate over the inputs without consuming them. @@ -342,16 +343,17 @@ mod tests { } #[test] - fn test_sort_input_mmap_read() { + fn test_sort_input_memory_map_read() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); tmpfile - .write_all(b"mmap test data") + .write_all(b"memory_map test data") .expect("should write to temp file"); tmpfile.flush().expect("should flush temp file"); let file = File::open(tmpfile.path()).expect("should open temp file"); - let mmap = Arc::new(unsafe { Mmap::map(&file).expect("should mmap temp file") }); - let mut input = SortInput::from_mmap(mmap); + let memory_map = + Arc::new(unsafe { MemoryMap::map(&file).expect("should memory_map temp file") }); + let mut input = SortInput::from_memory_map(memory_map); let mut buf = [0u8; 14]; let n = input.read(&mut buf).expect("should read from input"); @@ -368,8 +370,8 @@ mod tests { tmpfile.flush().expect("should flush temp file"); let file = File::open(tmpfile.path()).expect("should open temp file"); - let mmap = Arc::new(unsafe { Mmap::map(&file).expect("should mmap temp file") }); - let input = SortInput::from_mmap(mmap); + let mmap = Arc::new(unsafe { MemoryMap::map(&file).expect("should mmap temp file") }); + let input = SortInput::from_memory_map(mmap); let mut reader: Box = input.into(); let mut buf = [0u8; 9]; let n = reader @@ -388,10 +390,10 @@ mod tests { tmpfile.flush().expect("should flush temp file"); let file = File::open(tmpfile.path()).expect("should open temp file"); - let mmap = Arc::new(unsafe { Mmap::map(&file).expect("should mmap temp file") }); + let mmap = Arc::new(unsafe { MemoryMap::map(&file).expect("should mmap temp file") }); - let mut input1 = SortInput::from_mmap(mmap.clone()); - let mut input2 = SortInput::from_mmap(mmap); + let mut input1 = SortInput::from_memory_map(mmap.clone()); + let mut input2 = SortInput::from_memory_map(mmap); // Both should be able to read independently let mut buf1 = [0u8; 11]; @@ -493,7 +495,7 @@ mod tests { let mut buf1 = [0u8; 11]; let mut input1 = SortInput { inner: match &inputs.iter().next().expect("should get first input").inner { - SortInputInner::Mmap { data, offset } => SortInputInner::Mmap { + SortInputInner::FileRead { data, offset } => SortInputInner::FileRead { data: data.clone(), offset: AtomicUsize::new(offset.load(Ordering::Relaxed)), }, @@ -508,7 +510,7 @@ mod tests { let mut buf2 = [0u8; 11]; let mut input2 = SortInput { inner: match &inputs.iter().nth(1).expect("should get second input").inner { - SortInputInner::Mmap { data, offset } => SortInputInner::Mmap { + SortInputInner::FileRead { data, offset } => SortInputInner::FileRead { data: data.clone(), offset: AtomicUsize::new(offset.load(Ordering::Relaxed)), }, From df0cfe44d154bd425b15cdf850938290b26cca37 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Sun, 26 Apr 2026 13:54:41 +0200 Subject: [PATCH 08/19] fix: Delete unused files in test --- tests/fixtures/sort/merge_mixed_stdin.expected | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 tests/fixtures/sort/merge_mixed_stdin.expected diff --git a/tests/fixtures/sort/merge_mixed_stdin.expected b/tests/fixtures/sort/merge_mixed_stdin.expected deleted file mode 100644 index a46fadad31f..00000000000 --- a/tests/fixtures/sort/merge_mixed_stdin.expected +++ /dev/null @@ -1,6 +0,0 @@ -1 -3 -5 -apricot -elderberry -kiwi From 7d992f21a85d7eea63ebe0d4fdd05428d501c24b Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Sun, 26 Apr 2026 14:14:11 +0200 Subject: [PATCH 09/19] Refactor: Extract ouput-as-input as a function --- src/uu/sort/src/merge.rs | 65 +++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index 1c24f80c9c7..3019c03020f 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -66,6 +66,40 @@ fn effective_merge_batch_size(settings: &GlobalSettings) -> usize { batch_size } +/// If the output file is also listed as an input, use memory-map to load it before +/// it gets opened for writing. This allows reading the original content +/// via memory-map while writing to the same file, without needing a temp copy. +fn load_output_as_input( + output: &Output, + files: &[OsString], +) -> UResult)>> { + let Some(name) = output.as_output_name() else { + return Ok(None); + }; + let output_path = Path::new(name).canonicalize()?; + let appears = files + .iter() + .any(|f| Path::new(f).canonicalize().is_ok_and(|p| p == output_path)); + if appears { + let read_fd = File::open(name).map_err(|error| SortError::ReadFailed { + path: output_path.clone(), + error, + })?; + // SAFETY: We keep the read_fd open for the lifetime of the memory-map, + // and we only read from it. The file is not modified while the + // memory-map exists (writing happens later via a separate FD). + let output_as_input = Arc::new(unsafe { MemoryMap::map(&read_fd) }.map_err(|error| { + SortError::ReadFailed { + path: output_path.clone(), + error, + } + })?); + Ok(Some((output_path, output_as_input))) + } else { + Ok(None) + } +} + /// Merge pre-sorted `Box`s. /// /// If `settings.merge_batch_size` is greater than the length of `files`, intermediate files will be used. @@ -76,36 +110,7 @@ pub fn merge( output: Output, tmp_dir: &mut TmpDirWrapper, ) -> UResult<()> { - // If the output file is also listed as an input, Use memory-map to load it before - // it gets opened for writing. This allows reading the original content - // via memory-map while writing to the same file, without needing a temp copy. - let output_as_input = if let Some(name) = output.as_output_name() { - let output_path = Path::new(name).canonicalize()?; - let appears = files - .iter() - .any(|f| Path::new(f).canonicalize().is_ok_and(|p| p == output_path)); - if appears { - let read_fd = File::open(name).map_err(|error| SortError::ReadFailed { - path: output_path.clone(), - error, - })?; - // SAFETY: We keep the read_fd open for the lifetime of the memory-map, - // and we only read from it. The file is not modified while the - // memory-map exists (writing happens later via a separate FD). - let output_as_input = - Arc::new(unsafe { MemoryMap::map(&read_fd) }.map_err(|error| { - SortError::ReadFailed { - path: output_path.clone(), - error, - } - })?); - Some((output_path, output_as_input)) - } else { - None - } - } else { - None - }; + let output_as_input = load_output_as_input(&output, files)?; let sort_inputs = SortInputs::from_files_with_output(files, output_as_input)?; let files = sort_inputs From 0a12ea31d8e129963047066a8e8404c94d1450d1 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Sun, 26 Apr 2026 14:14:55 +0200 Subject: [PATCH 10/19] Fix: Failed test du to partial renaming --- src/uu/sort/src/sort_input.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/uu/sort/src/sort_input.rs b/src/uu/sort/src/sort_input.rs index 601476f9030..ea6590da0ba 100644 --- a/src/uu/sort/src/sort_input.rs +++ b/src/uu/sort/src/sort_input.rs @@ -355,10 +355,10 @@ mod tests { Arc::new(unsafe { MemoryMap::map(&file).expect("should memory_map temp file") }); let mut input = SortInput::from_memory_map(memory_map); - let mut buf = [0u8; 14]; + let mut buf = [0u8; 20]; let n = input.read(&mut buf).expect("should read from input"); - assert_eq!(n, 14); - assert_eq!(&buf, b"mmap test data"); + assert_eq!(n, 20); + assert_eq!(&buf, b"memory_map test data"); } #[test] From d6bc678a63d1a983ef683b5f82db144245732ac1 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Tue, 28 Apr 2026 07:41:18 +0200 Subject: [PATCH 11/19] fix: Ignore merge.rs test as WASI do not support temp dir --- src/uu/sort/src/merge.rs | 1 + src/uu/sort/src/sort_input.rs | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index 3019c03020f..efa7abd3e35 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -611,6 +611,7 @@ impl MergeInput for PlainMergeInput { } #[cfg(test)] +#[cfg(not(target_os = "wasi"))] mod tests { use super::*; use std::io::Write; diff --git a/src/uu/sort/src/sort_input.rs b/src/uu/sort/src/sort_input.rs index ea6590da0ba..b936aa5897e 100644 --- a/src/uu/sort/src/sort_input.rs +++ b/src/uu/sort/src/sort_input.rs @@ -315,10 +315,13 @@ impl IntoIterator for SortInputs { mod tests { use super::*; use std::ffi::OsString; + #[cfg(not(target_os = "wasi"))] use std::io::Write; + #[cfg(not(target_os = "wasi"))] use tempfile::NamedTempFile; #[test] + #[cfg(not(target_os = "wasi"))] fn test_sort_input_new_file() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); tmpfile @@ -343,6 +346,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "wasi"))] fn test_sort_input_memory_map_read() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); tmpfile @@ -362,6 +366,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "wasi"))] fn test_sort_input_into_box_read() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); tmpfile @@ -382,6 +387,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "wasi"))] fn test_sort_input_mmap_independent_reads() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); tmpfile @@ -417,6 +423,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "wasi"))] fn test_sort_inputs_single_file() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); tmpfile @@ -431,6 +438,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "wasi"))] fn test_sort_inputs_multiple_unique() { let mut tmpfile1 = NamedTempFile::new().expect("should create temp file"); tmpfile1 @@ -456,6 +464,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "wasi"))] fn test_sort_inputs_with_duplicates() { let mut tmpfile1 = NamedTempFile::new().expect("should create temp file"); tmpfile1 @@ -478,6 +487,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "wasi"))] fn test_sort_inputs_duplicate_mmap_independent() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); tmpfile @@ -546,6 +556,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "wasi"))] fn test_sort_inputs_mixed_stdin_and_files_allowed() { // Verify that mixing stdin with files is allowed (GNU Coreutils compatible) let mut tmpfile = NamedTempFile::new().expect("should create temp file"); @@ -562,6 +573,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "wasi"))] fn test_sort_inputs_order_preserved() { let mut tmpfile1 = NamedTempFile::new().expect("should create temp file"); tmpfile1 @@ -582,6 +594,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "wasi"))] fn test_sort_inputs_from_files_error() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); tmpfile @@ -599,6 +612,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "wasi"))] fn test_sort_inputs_into_iter() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); tmpfile From f4fd740c67404908e985fa3fd29bf8d53cd2f9e7 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Tue, 28 Apr 2026 07:51:37 +0200 Subject: [PATCH 12/19] fix: Add Mmap and variant to workspace.wordlist for cspell check --- .vscode/cspell.dictionaries/workspace.wordlist.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.vscode/cspell.dictionaries/workspace.wordlist.txt b/.vscode/cspell.dictionaries/workspace.wordlist.txt index 2cdd3033a8d..de3c15e8386 100644 --- a/.vscode/cspell.dictionaries/workspace.wordlist.txt +++ b/.vscode/cspell.dictionaries/workspace.wordlist.txt @@ -42,6 +42,9 @@ langid lscolors mdbook memchr +memmap +mmap +Mmap multifilereader onig ouroboros From 5c3f3e99971c29e03467803714db722168e8e732 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Mon, 18 May 2026 17:13:17 +0200 Subject: [PATCH 13/19] fix: Remove print statement --- tests/by-util/test_sort.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index 13fce2976b8..90fa6a3620c 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -1187,7 +1187,6 @@ fn test_merge_duplicate_files_interleaved() { let (at, mut ucmd) = at_and_ucmd!(); at.write("merge_duplicates_1.txt", "1\n3\n5\n"); at.write("merge_duplicates_2.txt", "2\n4\n6\n"); - println!("Sub sir: {}", at.as_string()); // Test merging file1, file2, file1 - same file appears twice ucmd.arg("-m") .arg("merge_duplicates_1.txt") From 3505b90169ebbd411e7b45a750e059112324614d Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Mon, 18 May 2026 19:16:36 +0200 Subject: [PATCH 14/19] Fix: Replace AtomicUsize by primitive usize --- src/uu/sort/src/sort_input.rs | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/src/uu/sort/src/sort_input.rs b/src/uu/sort/src/sort_input.rs index b936aa5897e..f971af3ae4c 100644 --- a/src/uu/sort/src/sort_input.rs +++ b/src/uu/sort/src/sort_input.rs @@ -13,10 +13,7 @@ use std::{ fs::File, io::{self, Read}, path::{Path, PathBuf}, - sync::{ - Arc, - atomic::{AtomicUsize, Ordering}, - }, + sync::Arc, }; use memmap2::Mmap as MemoryMap; @@ -33,7 +30,7 @@ enum SortInputInner { /// independent cursor per instance. FileRead { data: Arc, - offset: AtomicUsize, + offset: usize, }, Stdin, /// A unique file whose open() is deferred until iteration. @@ -70,10 +67,7 @@ impl SortInput { fn from_memory_map(data: Arc) -> Self { Self { - inner: SortInputInner::FileRead { - data, - offset: AtomicUsize::new(0), - }, + inner: SortInputInner::FileRead { data, offset: 0 }, } } @@ -94,12 +88,12 @@ impl Read for SortInput { match &mut self.inner { SortInputInner::File(file) => file.read(buf), SortInputInner::FileRead { data, offset } => { - let pos = offset.load(Ordering::Relaxed); + let pos = *offset; let available = data.len().saturating_sub(pos); let to_read = buf.len().min(available); if to_read > 0 { buf[..to_read].copy_from_slice(&data[pos..pos + to_read]); - offset.fetch_add(to_read, Ordering::Relaxed); + *offset = pos + to_read; } Ok(to_read) } @@ -507,7 +501,7 @@ mod tests { inner: match &inputs.iter().next().expect("should get first input").inner { SortInputInner::FileRead { data, offset } => SortInputInner::FileRead { data: data.clone(), - offset: AtomicUsize::new(offset.load(Ordering::Relaxed)), + offset: *offset, }, _ => panic!("Expected mmap"), }, @@ -522,7 +516,7 @@ mod tests { inner: match &inputs.iter().nth(1).expect("should get second input").inner { SortInputInner::FileRead { data, offset } => SortInputInner::FileRead { data: data.clone(), - offset: AtomicUsize::new(offset.load(Ordering::Relaxed)), + offset: *offset, }, _ => panic!("Expected mmap"), }, From de418108c251eec59668a42c27993d9815a85201 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Mon, 18 May 2026 19:21:01 +0200 Subject: [PATCH 15/19] doc: Amend //SAFETY comment --- src/uu/sort/src/merge.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index efa7abd3e35..8146e7ce06e 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -86,8 +86,9 @@ fn load_output_as_input( error, })?; // SAFETY: We keep the read_fd open for the lifetime of the memory-map, - // and we only read from it. The file is not modified while the - // memory-map exists (writing happens later via a separate FD). + // and we only read from it and the file is not modified while the + // memory-map exists by the current process (writing happens later via a separate FD). + // Yet, it may be possible that another process alter the content and consequently read corrupted data let output_as_input = Arc::new(unsafe { MemoryMap::map(&read_fd) }.map_err(|error| { SortError::ReadFailed { path: output_path.clone(), From d71cb0257928289aa1d065e616941814e285022b Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Sun, 31 May 2026 17:54:41 +0200 Subject: [PATCH 16/19] Split and move sort_inputs.rs into sort_inputs/ module --- src/uu/sort/src/merge.rs | 2 +- src/uu/sort/src/sort.rs | 2 +- src/uu/sort/src/sort_inputs/iterator.rs | 77 ++++ src/uu/sort/src/sort_inputs/mod.rs | 4 + .../{sort_input.rs => sort_inputs/model.rs} | 332 +++++++----------- 5 files changed, 202 insertions(+), 215 deletions(-) create mode 100644 src/uu/sort/src/sort_inputs/iterator.rs create mode 100644 src/uu/sort/src/sort_inputs/mod.rs rename src/uu/sort/src/{sort_input.rs => sort_inputs/model.rs} (66%) diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index 8146e7ce06e..12d13cc0401 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -35,7 +35,7 @@ use crate::{ GlobalSettings, Output, SortError, chunks::{self, Chunk, RecycledChunk}, compare_by, current_open_fd_count, fd_soft_limit, - sort_input::{SortInput, SortInputs}, + sort_inputs::{SortInput, SortInputs}, tmp_dir::TmpDirWrapper, }; diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 81f26e89081..1dd143772e3 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -17,7 +17,7 @@ mod custom_str_cmp; mod ext_sort; mod merge; mod numeric_str_cmp; -mod sort_input; +mod sort_inputs; mod tmp_dir; use bigdecimal::BigDecimal; diff --git a/src/uu/sort/src/sort_inputs/iterator.rs b/src/uu/sort/src/sort_inputs/iterator.rs new file mode 100644 index 00000000000..a7d4775c79d --- /dev/null +++ b/src/uu/sort/src/sort_inputs/iterator.rs @@ -0,0 +1,77 @@ +use crate::SortError; +use crate::sort_inputs::{SortInput, SortInputInner, SortInputs}; +use std::{fs::File, vec::IntoIter}; +use uucore::error::UResult; + +/// Iterator that opens LazyFile entries as they are yielded. +#[derive(Debug)] +pub struct SortInputsIntoIter { + inner: IntoIter, +} + +impl Iterator for SortInputsIntoIter { + type Item = UResult; + + fn next(&mut self) -> Option { + let mut input = self.inner.next()?; + + if let SortInputInner::LazyFile(path) = &input.inner { + match File::open(path) { + Ok(file) => input.inner = SortInputInner::File(file), + Err(error) => { + return Some(Err(SortError::ReadFailed { + path: path.clone(), + error, + } + .into())); + } + } + } + + Some(Ok(input)) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +impl ExactSizeIterator for SortInputsIntoIter { + fn len(&self) -> usize { + self.inner.len() + } +} + +impl IntoIterator for SortInputs { + type Item = UResult; + type IntoIter = SortInputsIntoIter; + + fn into_iter(self) -> Self::IntoIter { + SortInputsIntoIter { + inner: self.inputs.into_iter(), + } + } +} +#[cfg(test)] +mod test { + + use crate::sort_inputs::SortInputs; + #[cfg(not(target_os = "wasi"))] + use std::io::Write; + #[cfg(not(target_os = "wasi"))] + use tempfile::NamedTempFile; + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_inputs_into_iter() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"data") + .expect("should write to temp file"); + + let files = vec![tmpfile.path().as_os_str().to_os_string()]; + let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + let count = inputs.into_iter().count(); + assert_eq!(count, 1); + } +} diff --git a/src/uu/sort/src/sort_inputs/mod.rs b/src/uu/sort/src/sort_inputs/mod.rs new file mode 100644 index 00000000000..bcfb59ed5a0 --- /dev/null +++ b/src/uu/sort/src/sort_inputs/mod.rs @@ -0,0 +1,4 @@ +mod iterator; +mod model; + +pub use model::{SortInput, SortInputInner, SortInputs}; diff --git a/src/uu/sort/src/sort_input.rs b/src/uu/sort/src/sort_inputs/model.rs similarity index 66% rename from src/uu/sort/src/sort_input.rs rename to src/uu/sort/src/sort_inputs/model.rs index f971af3ae4c..0ea4c7f357e 100644 --- a/src/uu/sort/src/sort_input.rs +++ b/src/uu/sort/src/sort_inputs/model.rs @@ -23,7 +23,7 @@ use crate::{STDIN_FILE, SortError}; /// An inner enum representing the actual source of a sort input. #[derive(Debug)] -enum SortInputInner { +pub enum SortInputInner { /// An already-opened regular file. File(File), /// A memory-mapped file shared across duplicate paths, with an @@ -40,51 +40,11 @@ enum SortInputInner { /// Handle to a single sort input (file, memory-map, or stdin). #[derive(Debug)] pub struct SortInput { - inner: SortInputInner, + pub inner: SortInputInner, } impl SortInput { - /// Open a path directly (stdin if `"-"`). - pub fn new(path: &OsStr) -> UResult { - if path == STDIN_FILE { - return Ok(Self { - inner: SortInputInner::Stdin, - }); - } - - let path = Path::new(path); - match File::open(path) { - Ok(f) => Ok(Self { - inner: SortInputInner::File(f), - }), - Err(error) => Err(SortError::ReadFailed { - path: path.to_owned(), - error, - } - .into()), - } - } - - fn from_memory_map(data: Arc) -> Self { - Self { - inner: SortInputInner::FileRead { data, offset: 0 }, - } - } - - /// Returns true if this input is stdin. - pub fn is_stdin(&self) -> bool { - matches!(self.inner, SortInputInner::Stdin) - } -} - -impl Read for SortInput { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - // Open LazyFile on first read (fallback if iterator didn't do it). - if let SortInputInner::LazyFile(path) = &self.inner { - let file = File::open(path)?; - self.inner = SortInputInner::File(file); - } - + fn read_inner(&mut self, buf: &mut [u8]) -> io::Result { match &mut self.inner { SortInputInner::File(file) => file.read(buf), SortInputInner::FileRead { data, offset } => { @@ -101,14 +61,34 @@ impl Read for SortInput { let mut stdin = io::stdin(); stdin.read(buf) } + //All lazyFile are already opened in the SortInputsIntoIter next() method SortInputInner::LazyFile(_) => unreachable!(), } } + + fn from_memory_map(data: Arc) -> Self { + Self { + inner: SortInputInner::FileRead { data, offset: 0 }, + } + } + + fn stdin() -> Self { + Self { + inner: SortInputInner::Stdin, + } + } + + fn from_path(file: &OsStr) -> Self { + let path = Path::new(file); + Self { + inner: SortInputInner::LazyFile(path.to_path_buf()), + } + } } -impl From for Box { - fn from(input: SortInput) -> Self { - Box::new(input) as Box +impl Read for SortInput { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.read_inner(buf) } } @@ -119,16 +99,10 @@ impl From for Box { /// merge batch size limits active FDs. #[derive(Debug)] pub struct SortInputs { - inputs: Vec, + pub inputs: Vec, } impl SortInputs { - #[allow(dead_code)] - // Used only in unit tests. - pub fn from_files(files: &[std::ffi::OsString]) -> UResult { - Self::from_files_with_output(files, None) - } - /// Build a `SortInputs` from paths. /// /// - Duplicate paths → one memory-map shared across all instances. @@ -159,160 +133,110 @@ impl SortInputs { for file in files { if file == STDIN_FILE { - inputs.push(SortInput { - inner: SortInputInner::Stdin, - }); - } else { - let path = Path::new(file); - let canonical = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); - - // Check if this is the output file used as input - if let Some((ref output_path, ref output_memory_map)) = output_as_input { - if canonical == *output_path { - inputs.push(SortInput::from_memory_map(output_memory_map.clone())); - continue; - } - } + inputs.push(SortInput::stdin()); + continue; + } - if *path_counts.get(&canonical).unwrap_or(&0) > 1 { - // Duplicate file: use memory-map - let memory_map = if let Some(memory_map) = opened_files.get(&canonical) { - memory_map.clone() - } else { - let f = File::open(path).map_err(|error| SortError::ReadFailed { - path: path.to_owned(), - error, - })?; - // SAFETY: We keep the file open for the lifetime of the memory-map, - // and we only read from it. The file is not modified. - let memory_map = - Arc::new(unsafe { MemoryMap::map(&f) }.map_err(|error| { - SortError::ReadFailed { - path: path.to_owned(), - error, - } - })?); - opened_files.insert(canonical, memory_map.clone()); - memory_map - }; - inputs.push(SortInput::from_memory_map(memory_map)); - } else { - // Unique file: defer opening until iteration so that - // merge_with_file_limit can respect its batch_size and - // avoid exceeding the file-descriptor soft limit. - inputs.push(SortInput { - inner: SortInputInner::LazyFile(path.to_path_buf()), - }); + let path = Path::new(file); + let canonical = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); + // Check if this is the output file used as input + // Then use already opened fd for output + if let Some((ref output_path, ref output_memory_map)) = output_as_input { + if canonical == *output_path { + inputs.push(SortInput::from_memory_map(output_memory_map.clone())); + continue; } } - } - Ok(Self { inputs }) - } + // Unique file as input + if *path_counts.get(&canonical).unwrap_or(&0) <= 1 { + inputs.push(SortInput::from_path(file)); + continue; + } - /// Returns the total number of inputs (including duplicates). - #[allow(dead_code)] - pub fn len(&self) -> usize { - self.inputs.len() - } + // Duplicate file: use memory-map + let memory_map = if let Some(memory_map) = opened_files.get(&canonical) { + memory_map.clone() + } else { + let f = File::open(path).map_err(|error| SortError::ReadFailed { + path: path.to_owned(), + error, + })?; + // SAFETY: We keep the file open for the lifetime of the memory-map, + // and we only read from it. The file is not modified. + let memory_map = Arc::new(unsafe { MemoryMap::map(&f) }.map_err(|error| { + SortError::ReadFailed { + path: path.to_owned(), + error, + } + })?); + opened_files.insert(canonical, memory_map.clone()); + memory_map + }; + inputs.push(SortInput::from_memory_map(memory_map)); + } - /// Returns true if there are no inputs. - #[allow(dead_code)] - pub fn is_empty(&self) -> bool { - self.inputs.is_empty() + Ok(Self { + inputs: inputs.into_iter().collect(), + }) } +} - /// Returns the number of unique sources (stdin + unique files + memory-map groups). - #[allow(dead_code)] - pub fn unique_count(&self) -> usize { - let mut file_count = 0; - let mut stdin_present = false; - - // Count unique memory-map instances by Arc pointer - let mut seen_memory_maps = std::collections::HashSet::new(); +#[cfg(test)] +mod tests { + use super::*; + use std::ffi::OsString; + #[cfg(not(target_os = "wasi"))] + use std::io::Write; + #[cfg(not(target_os = "wasi"))] + use tempfile::NamedTempFile; - for input in &self.inputs { - match &input.inner { - SortInputInner::Stdin => { - stdin_present = true; - } - SortInputInner::File(_) | SortInputInner::LazyFile(_) => { - file_count += 1; - } - SortInputInner::FileRead { data, .. } => { - seen_memory_maps.insert(Arc::as_ptr(data)); - } - } + // Util method for SortInputs in test + impl SortInputs { + pub fn from_files(files: &[OsString]) -> UResult { + Self::from_files_with_output(files, None) } - file_count + seen_memory_maps.len() + usize::from(stdin_present) - } - - /// Iterate over the inputs without consuming them. - #[allow(dead_code)] - pub fn iter(&self) -> impl Iterator { - self.inputs.iter() - } -} + /// Returns the total number of inputs (including duplicates). + pub fn len(&self) -> usize { + self.inputs.len() + } -/// Iterator that opens LazyFile entries as they are yielded. -#[derive(Debug)] -pub struct SortInputsIntoIter { - inner: std::vec::IntoIter, -} + /// Returns true if there are no inputs. + fn is_empty(&self) -> bool { + self.inputs.is_empty() + } -impl Iterator for SortInputsIntoIter { - type Item = UResult; + /// Returns the number of unique sources (stdin + unique files + memory-map groups). + pub fn unique_count(&self) -> usize { + let mut file_count = 0; + let mut stdin_present = false; - fn next(&mut self) -> Option { - let mut input = self.inner.next()?; + // Count unique memory-map instances by Arc pointer + let mut seen_memory_maps = std::collections::HashSet::new(); - // Convert LazyFile to File so errors surface during iteration. - if let SortInputInner::LazyFile(path) = &input.inner { - match File::open(path) { - Ok(file) => input.inner = SortInputInner::File(file), - Err(error) => { - return Some(Err(SortError::ReadFailed { - path: path.clone(), - error, + for input in &self.inputs { + match &input.inner { + SortInputInner::Stdin => { + stdin_present = true; + } + SortInputInner::File(_) | SortInputInner::LazyFile(_) => { + file_count += 1; + } + SortInputInner::FileRead { data, .. } => { + seen_memory_maps.insert(Arc::as_ptr(data)); } - .into())); } } - } - Some(Ok(input)) - } - fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() - } -} - -impl ExactSizeIterator for SortInputsIntoIter { - fn len(&self) -> usize { - self.inner.len() - } -} - -impl IntoIterator for SortInputs { - type Item = UResult; - type IntoIter = SortInputsIntoIter; + file_count + seen_memory_maps.len() + usize::from(stdin_present) + } - fn into_iter(self) -> Self::IntoIter { - SortInputsIntoIter { - inner: self.inputs.into_iter(), + /// Iterate over the inputs without consuming them. + fn iter(&self) -> impl Iterator { + self.inputs.iter() } } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::ffi::OsString; - #[cfg(not(target_os = "wasi"))] - use std::io::Write; - #[cfg(not(target_os = "wasi"))] - use tempfile::NamedTempFile; #[test] #[cfg(not(target_os = "wasi"))] @@ -323,20 +247,21 @@ mod tests { .expect("should write to temp file"); tmpfile.flush().expect("should flush temp file"); - let input = SortInput::new(tmpfile.path().as_os_str()).expect("should create sort input"); - assert!(!input.is_stdin()); + let input = SortInput::from_path(tmpfile.path().as_os_str()); + assert!(!matches!(input.inner, SortInputInner::Stdin)); } #[test] fn test_sort_input_new_stdin() { - let input = SortInput::new(OsStr::new("-")).expect("should create sort input for stdin"); - assert!(input.is_stdin()); + let input = SortInput::stdin(); + assert!(matches!(input.inner, SortInputInner::Stdin)); } #[test] fn test_sort_input_new_missing_file() { - let result = SortInput::new(OsStr::new("/nonexistent/path/file.txt")); - assert!(result.is_err()); + let file = OsStr::new("/nonexistent/path/file.txt"); + let lazy_input = SortInput::from_path(&file); + assert!(matches!(lazy_input.inner, SortInputInner::LazyFile(_))); } #[test] @@ -371,7 +296,7 @@ mod tests { let file = File::open(tmpfile.path()).expect("should open temp file"); let mmap = Arc::new(unsafe { MemoryMap::map(&file).expect("should mmap temp file") }); let input = SortInput::from_memory_map(mmap); - let mut reader: Box = input.into(); + let mut reader: Box = Box::new(input); let mut buf = [0u8; 9]; let n = reader .read(&mut buf) @@ -531,14 +456,9 @@ mod tests { fn test_sort_inputs_stdin_only() { let files = vec![OsString::from("-")]; let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + let input = inputs.iter().next().expect("should get first input"); assert_eq!(inputs.len(), 1); - assert!( - inputs - .iter() - .next() - .expect("should get first input") - .is_stdin() - ); + assert!(matches!(input.inner, SortInputInner::Stdin)); } #[test] @@ -604,18 +524,4 @@ mod tests { assert!(iter.next().expect("should get first input").is_ok()); // first file opens successfully assert!(iter.next().expect("should get second input").is_err()); // second file fails to open } - - #[test] - #[cfg(not(target_os = "wasi"))] - fn test_sort_inputs_into_iter() { - let mut tmpfile = NamedTempFile::new().expect("should create temp file"); - tmpfile - .write_all(b"data") - .expect("should write to temp file"); - - let files = vec![tmpfile.path().as_os_str().to_os_string()]; - let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); - let count = inputs.into_iter().count(); - assert_eq!(count, 1); - } } From 6efe6e3599e0bfd8a6d073254c3ddf3230cd98ab Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Mon, 1 Jun 2026 17:11:35 +0200 Subject: [PATCH 17/19] tech: Refactoring to better modelize 'data' workflow and segmentation. Use DeferredInput as read from cmd line from OpenedInput which are used to read and create FD --- src/uu/sort/src/merge.rs | 4 +- src/uu/sort/src/sort_inputs/iterator.rs | 276 +++++++++++++++++++++--- src/uu/sort/src/sort_inputs/mod.rs | 2 +- src/uu/sort/src/sort_inputs/model.rs | 240 ++++++++++----------- 4 files changed, 369 insertions(+), 153 deletions(-) diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index 12d13cc0401..b697738e531 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -35,7 +35,7 @@ use crate::{ GlobalSettings, Output, SortError, chunks::{self, Chunk, RecycledChunk}, compare_by, current_open_fd_count, fd_soft_limit, - sort_inputs::{SortInput, SortInputs}, + sort_inputs::{OpenedInput, SortInputs}, tmp_dir::TmpDirWrapper, }; @@ -116,7 +116,7 @@ pub fn merge( let sort_inputs = SortInputs::from_files_with_output(files, output_as_input)?; let files = sort_inputs .into_iter() - .map(|result| result.map(|input| PlainMergeInput:: { inner: input })); + .map(|result| result.map(|input| PlainMergeInput:: { inner: input })); if settings.compress_prog.is_none() { merge_with_file_limit::<_, _, WriteablePlainTmpFile>(files, settings, output, tmp_dir) } else { diff --git a/src/uu/sort/src/sort_inputs/iterator.rs b/src/uu/sort/src/sort_inputs/iterator.rs index a7d4775c79d..053a3821e91 100644 --- a/src/uu/sort/src/sort_inputs/iterator.rs +++ b/src/uu/sort/src/sort_inputs/iterator.rs @@ -1,34 +1,78 @@ use crate::SortError; -use crate::sort_inputs::{SortInput, SortInputInner, SortInputs}; -use std::{fs::File, vec::IntoIter}; +use crate::sort_inputs::{DeferredInput, InputAccess, OpenedInput, SortInput, SortInputs}; +use memmap2::Mmap as MemoryMap; +use std::{collections::HashMap, fs::File, path::PathBuf, sync::Arc, vec::IntoIter}; use uucore::error::UResult; -/// Iterator that opens LazyFile entries as they are yielded. -#[derive(Debug)] +/// Iterator that opens deferred input entries as they are yielded. +#[derive(Debug, Default)] pub struct SortInputsIntoIter { inner: IntoIter, + memory_map_files: HashMap>, +} + +impl SortInputsIntoIter { + fn open_file(path: PathBuf) -> UResult { + File::open(&path) + .map(OpenedInput::File) + .map_err(|error| SortError::ReadFailed { path, error }.into()) + } + + fn shared_memory_map(&mut self, path: PathBuf) -> UResult { + if let Some(memory_map) = self.memory_map_files.get(&path) { + return Ok(OpenedInput::SharedMemoryMap { + data: memory_map.clone(), + offset: 0, + }); + } + + let file = File::open(&path).map_err(|error| SortError::ReadFailed { + path: path.clone(), + error, + })?; + + // SAFETY: This creates a read-only memory map for an input file. The map is + // only exposed through `OpenedInput::SharedMemoryMap`, which implements + // `Read` by copying bytes out and never mutates the mapped region. + let memory_map = + Arc::new( + unsafe { MemoryMap::map(&file) }.map_err(|error| SortError::ReadFailed { + path: path.clone(), + error, + })?, + ); + + self.memory_map_files.insert(path, memory_map.clone()); + + Ok(OpenedInput::SharedMemoryMap { + data: memory_map, + offset: 0, + }) + } } impl Iterator for SortInputsIntoIter { - type Item = UResult; + type Item = UResult; fn next(&mut self) -> Option { - let mut input = self.inner.next()?; - - if let SortInputInner::LazyFile(path) = &input.inner { - match File::open(path) { - Ok(file) => input.inner = SortInputInner::File(file), - Err(error) => { - return Some(Err(SortError::ReadFailed { - path: path.clone(), - error, - } - .into())); - } + let input = self.inner.next()?; + + let opened_input = match input.inner { + DeferredInput::Stdin => Ok(OpenedInput::Stdin), + DeferredInput::Path { + path, + access: InputAccess::OpenFile, + } => Self::open_file(path), + DeferredInput::Path { + path, + access: InputAccess::SharedMemoryMap, + } => self.shared_memory_map(path), + DeferredInput::OutputSnapshot(data) => { + Ok(OpenedInput::SharedMemoryMap { data, offset: 0 }) } - } + }; - Some(Ok(input)) + Some(opened_input) } fn size_hint(&self) -> (usize, Option) { @@ -43,35 +87,209 @@ impl ExactSizeIterator for SortInputsIntoIter { } impl IntoIterator for SortInputs { - type Item = UResult; + type Item = UResult; type IntoIter = SortInputsIntoIter; fn into_iter(self) -> Self::IntoIter { SortInputsIntoIter { inner: self.inputs.into_iter(), + memory_map_files: HashMap::new(), } } } + #[cfg(test)] mod test { - - use crate::sort_inputs::SortInputs; + use super::SortInputsIntoIter; + use crate::sort_inputs::{DeferredInput, InputAccess, OpenedInput, SortInput, SortInputs}; + use memmap2::Mmap as MemoryMap; #[cfg(not(target_os = "wasi"))] - use std::io::Write; + use std::{ + fs::File, + io::{Read, Write}, + sync::Arc, + }; #[cfg(not(target_os = "wasi"))] use tempfile::NamedTempFile; #[test] #[cfg(not(target_os = "wasi"))] - fn test_sort_inputs_into_iter() { + fn test_open_file_opens_as_file() { let mut tmpfile = NamedTempFile::new().expect("should create temp file"); tmpfile - .write_all(b"data") + .write_all(b"unique file data") .expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let opened_input = + SortInputsIntoIter::open_file(tmpfile.path().to_path_buf()).expect("should open input"); + + match opened_input { + OpenedInput::File(mut file) => { + let mut contents = String::new(); + file.read_to_string(&mut contents) + .expect("should read opened file"); + assert_eq!(contents, "unique file data"); + } + other => panic!("expected opened file, got {other:?}"), + } + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_shared_memory_map_opens_as_shared_memory_map() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"duplicate data") + .expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let mut iter = SortInputsIntoIter::default(); + let opened_input = iter + .shared_memory_map(tmpfile.path().to_path_buf()) + .expect("should open shared memory map"); + + match opened_input { + OpenedInput::SharedMemoryMap { data, offset } => { + assert_eq!(offset, 0); + assert_eq!(&data[..], b"duplicate data"); + } + other => panic!("expected shared memory map, got {other:?}"), + } + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_shared_memory_maps_have_independent_offsets() { + let mut tmpfile = NamedTempFile::new().expect("should create temp file"); + tmpfile + .write_all(b"independent offsets") + .expect("should write to temp file"); + tmpfile.flush().expect("should flush temp file"); + + let path = tmpfile.path().to_path_buf(); + let mut iter = SortInputsIntoIter::default(); + let mut first = iter + .shared_memory_map(path.clone()) + .expect("should open first shared memory map"); + let mut second = iter + .shared_memory_map(path) + .expect("should open second shared memory map"); + + let mut first_buf = [0u8; 11]; + first + .read_exact(&mut first_buf) + .expect("should read from first mmap"); + assert_eq!(&first_buf, b"independent"); + + let mut second_buf = [0u8; 11]; + second + .read_exact(&mut second_buf) + .expect("should read from second mmap"); + assert_eq!(&second_buf, b"independent"); + } + + #[test] + #[cfg(not(target_os = "wasi"))] + fn test_sort_inputs_into_iter_opens_each_input_kind() { + let mut open_file_input = NamedTempFile::new().expect("should create open-file input"); + open_file_input + .write_all(b"open file input") + .expect("should write open-file input"); + open_file_input + .flush() + .expect("should flush open-file input"); + + let mut shared_memory_input = + NamedTempFile::new().expect("should create shared-memory input"); + shared_memory_input + .write_all(b"shared memory input") + .expect("should write shared-memory input"); + shared_memory_input + .flush() + .expect("should flush shared-memory input"); + + let mut output = NamedTempFile::new().expect("should create output file"); + output + .write_all(b"output snapshot") + .expect("should write output file"); + output.flush().expect("should flush output file"); + let output_file = File::open(output.path()).expect("should open output for snapshot"); + let output_snapshot = + Arc::new(unsafe { MemoryMap::map(&output_file).expect("should mmap output snapshot") }); + + let inputs = SortInputs { + inputs: vec![ + SortInput { + inner: DeferredInput::Stdin, + }, + SortInput { + inner: DeferredInput::Path { + path: open_file_input.path().to_path_buf(), + access: InputAccess::OpenFile, + }, + }, + SortInput { + inner: DeferredInput::Path { + path: shared_memory_input.path().to_path_buf(), + access: InputAccess::SharedMemoryMap, + }, + }, + SortInput { + inner: DeferredInput::OutputSnapshot(output_snapshot), + }, + ], + }; + + let mut iter = inputs.into_iter(); + assert_eq!(iter.len(), 4); + + assert!(matches!( + iter.next() + .expect("should yield stdin") + .expect("stdin is valid"), + OpenedInput::Stdin + )); + assert_eq!(iter.len(), 3); + + match iter + .next() + .expect("should yield open-file input") + .expect("should open file") + { + OpenedInput::File(mut file) => { + let mut contents = String::new(); + file.read_to_string(&mut contents) + .expect("should read opened file"); + assert_eq!(contents, "open file input"); + } + other => panic!("expected opened file, got {other:?}"), + } + + match iter + .next() + .expect("should yield shared-memory input") + .expect("should open shared-memory input") + { + OpenedInput::SharedMemoryMap { data, offset } => { + assert_eq!(offset, 0); + assert_eq!(&data[..], b"shared memory input"); + } + other => panic!("expected shared memory map, got {other:?}"), + } + + match iter + .next() + .expect("should yield output snapshot") + .expect("should open output snapshot") + { + OpenedInput::SharedMemoryMap { data, offset } => { + assert_eq!(offset, 0); + assert_eq!(&data[..], b"output snapshot"); + } + other => panic!("expected output snapshot memory map, got {other:?}"), + } - let files = vec![tmpfile.path().as_os_str().to_os_string()]; - let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); - let count = inputs.into_iter().count(); - assert_eq!(count, 1); + assert!(iter.next().is_none()); } } diff --git a/src/uu/sort/src/sort_inputs/mod.rs b/src/uu/sort/src/sort_inputs/mod.rs index bcfb59ed5a0..74923e47860 100644 --- a/src/uu/sort/src/sort_inputs/mod.rs +++ b/src/uu/sort/src/sort_inputs/mod.rs @@ -1,4 +1,4 @@ mod iterator; mod model; -pub use model::{SortInput, SortInputInner, SortInputs}; +pub use model::{DeferredInput, InputAccess, OpenedInput, SortInput, SortInputs}; diff --git a/src/uu/sort/src/sort_inputs/model.rs b/src/uu/sort/src/sort_inputs/model.rs index 0ea4c7f357e..3916a126853 100644 --- a/src/uu/sort/src/sort_inputs/model.rs +++ b/src/uu/sort/src/sort_inputs/model.rs @@ -9,7 +9,7 @@ use std::{ collections::HashMap, - ffi::OsStr, + ffi::OsString, fs::File, io::{self, Read}, path::{Path, PathBuf}, @@ -19,76 +19,83 @@ use std::{ use memmap2::Mmap as MemoryMap; use uucore::error::UResult; -use crate::{STDIN_FILE, SortError}; +use crate::STDIN_FILE; -/// An inner enum representing the actual source of a sort input. +/// Deferred representation of a sort input before any file descriptor is opened. #[derive(Debug)] -pub enum SortInputInner { - /// An already-opened regular file. - File(File), - /// A memory-mapped file shared across duplicate paths, with an - /// independent cursor per instance. - FileRead { - data: Arc, - offset: usize, - }, +pub enum DeferredInput { Stdin, - /// A unique file whose open() is deferred until iteration. - LazyFile(PathBuf), + /// A file whose open() is deferred until iteration. + Path { + path: PathBuf, + access: InputAccess, + }, + /// Snapshot of the output file captured before it is opened for writing. + OutputSnapshot(Arc), } -/// Handle to a single sort input (file, memory-map, or stdin). +/// Describes how a deferred path should be opened when yielded by the iterator. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum InputAccess { + OpenFile, + SharedMemoryMap, +} + +/// Concrete input opened by `SortInputsIntoIter`; this is the type consumed by merge. #[derive(Debug)] -pub struct SortInput { - pub inner: SortInputInner, +pub enum OpenedInput { + Stdin, + File(File), + SharedMemoryMap { data: Arc, offset: usize }, } -impl SortInput { - fn read_inner(&mut self, buf: &mut [u8]) -> io::Result { - match &mut self.inner { - SortInputInner::File(file) => file.read(buf), - SortInputInner::FileRead { data, offset } => { +impl Read for OpenedInput { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self { + OpenedInput::File(file) => file.read(buf), + OpenedInput::SharedMemoryMap { data, offset } => { let pos = *offset; let available = data.len().saturating_sub(pos); let to_read = buf.len().min(available); + if to_read > 0 { buf[..to_read].copy_from_slice(&data[pos..pos + to_read]); *offset = pos + to_read; } + Ok(to_read) } - SortInputInner::Stdin => { + OpenedInput::Stdin => { let mut stdin = io::stdin(); stdin.read(buf) } - //All lazyFile are already opened in the SortInputsIntoIter next() method - SortInputInner::LazyFile(_) => unreachable!(), } } +} - fn from_memory_map(data: Arc) -> Self { - Self { - inner: SortInputInner::FileRead { data, offset: 0 }, - } - } +/// Handle to a single sort input (file, memory-map, or stdin). +#[derive(Debug)] +pub struct SortInput { + pub inner: DeferredInput, +} +impl SortInput { fn stdin() -> Self { Self { - inner: SortInputInner::Stdin, + inner: DeferredInput::Stdin, } } - fn from_path(file: &OsStr) -> Self { - let path = Path::new(file); + fn to_args_path(path: PathBuf, access: InputAccess) -> Self { Self { - inner: SortInputInner::LazyFile(path.to_path_buf()), + inner: DeferredInput::Path { path, access }, } } -} -impl Read for SortInput { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - self.read_inner(buf) + fn to_output(memory_map: Arc) -> Self { + Self { + inner: DeferredInput::OutputSnapshot(memory_map), + } } } @@ -105,11 +112,11 @@ pub struct SortInputs { impl SortInputs { /// Build a `SortInputs` from paths. /// - /// - Duplicate paths → one memory-map shared across all instances. - /// - Unique paths → stored as `LazyFile` and opened when the iterator yields them. - /// - `output_as_input`: pre-created memory-map used when the output file is also an input. + /// - Duplicate paths → stored as deferred paths marked for shared memory-map access. + /// - Unique paths → stored as deferred paths marked for regular file access. + /// - `output_as_input` → stored as a pre-created memory-map snapshot. pub fn from_files_with_output( - files: &[std::ffi::OsString], + files: &[OsString], output_as_input: Option<(PathBuf, Arc)>, ) -> UResult { let mut inputs = Vec::with_capacity(files.len()); @@ -125,12 +132,10 @@ impl SortInputs { } // Second pass: build inputs - // - Unique files: LazyFile (opened during iteration) - // - Duplicate files: memory-map once, share it - // - Output-as-input: use pre-created memory-map - // - Stdin: single occurrence - let mut opened_files: HashMap> = HashMap::new(); - + // - Unique files: deferred path opened as a regular file during iteration + // - Duplicate files: deferred path opened as a shared memory-map during iteration + // - Output-as-input: use pre-created memory-map snapshot + // - Stdin: preserve each occurrence for file in files { if file == STDIN_FILE { inputs.push(SortInput::stdin()); @@ -143,37 +148,25 @@ impl SortInputs { // Then use already opened fd for output if let Some((ref output_path, ref output_memory_map)) = output_as_input { if canonical == *output_path { - inputs.push(SortInput::from_memory_map(output_memory_map.clone())); + inputs.push(SortInput::to_output(output_memory_map.clone())); continue; } } // Unique file as input if *path_counts.get(&canonical).unwrap_or(&0) <= 1 { - inputs.push(SortInput::from_path(file)); + inputs.push(SortInput::to_args_path( + PathBuf::from(file), + InputAccess::OpenFile, + )); continue; - } - - // Duplicate file: use memory-map - let memory_map = if let Some(memory_map) = opened_files.get(&canonical) { - memory_map.clone() + // Duplicate input } else { - let f = File::open(path).map_err(|error| SortError::ReadFailed { - path: path.to_owned(), - error, - })?; - // SAFETY: We keep the file open for the lifetime of the memory-map, - // and we only read from it. The file is not modified. - let memory_map = Arc::new(unsafe { MemoryMap::map(&f) }.map_err(|error| { - SortError::ReadFailed { - path: path.to_owned(), - error, - } - })?); - opened_files.insert(canonical, memory_map.clone()); - memory_map - }; - inputs.push(SortInput::from_memory_map(memory_map)); + inputs.push(SortInput::to_args_path( + PathBuf::from(file), + InputAccess::SharedMemoryMap, + )); + } } Ok(Self { @@ -185,7 +178,7 @@ impl SortInputs { #[cfg(test)] mod tests { use super::*; - use std::ffi::OsString; + use std::ffi::{OsStr, OsString}; #[cfg(not(target_os = "wasi"))] use std::io::Write; #[cfg(not(target_os = "wasi"))] @@ -209,27 +202,26 @@ mod tests { /// Returns the number of unique sources (stdin + unique files + memory-map groups). pub fn unique_count(&self) -> usize { - let mut file_count = 0; let mut stdin_present = false; - - // Count unique memory-map instances by Arc pointer + let mut seen_paths = std::collections::HashSet::new(); let mut seen_memory_maps = std::collections::HashSet::new(); for input in &self.inputs { match &input.inner { - SortInputInner::Stdin => { + DeferredInput::Stdin => { stdin_present = true; } - SortInputInner::File(_) | SortInputInner::LazyFile(_) => { - file_count += 1; + DeferredInput::Path { path, .. } => { + let canonical = path.canonicalize().unwrap_or_else(|_| path.clone()); + seen_paths.insert(canonical); } - SortInputInner::FileRead { data, .. } => { + DeferredInput::OutputSnapshot(data) => { seen_memory_maps.insert(Arc::as_ptr(data)); } } } - file_count + seen_memory_maps.len() + usize::from(stdin_present) + seen_paths.len() + seen_memory_maps.len() + usize::from(stdin_present) } /// Iterate over the inputs without consuming them. @@ -247,21 +239,35 @@ mod tests { .expect("should write to temp file"); tmpfile.flush().expect("should flush temp file"); - let input = SortInput::from_path(tmpfile.path().as_os_str()); - assert!(!matches!(input.inner, SortInputInner::Stdin)); - } + let input = SortInput::to_args_path(tmpfile.path().to_path_buf(), InputAccess::OpenFile); + assert!(matches!( + input.inner, + DeferredInput::Path { + access: InputAccess::OpenFile, + .. + } + )); + } #[test] fn test_sort_input_new_stdin() { let input = SortInput::stdin(); - assert!(matches!(input.inner, SortInputInner::Stdin)); + assert!(matches!(input.inner, DeferredInput::Stdin)); } #[test] fn test_sort_input_new_missing_file() { let file = OsStr::new("/nonexistent/path/file.txt"); - let lazy_input = SortInput::from_path(&file); - assert!(matches!(lazy_input.inner, SortInputInner::LazyFile(_))); + + let lazy_input = SortInput::to_args_path(PathBuf::from(file), InputAccess::OpenFile); + + assert!(matches!( + lazy_input.inner, + DeferredInput::Path { + access: InputAccess::OpenFile, + .. + } + )); } #[test] @@ -276,8 +282,10 @@ mod tests { let file = File::open(tmpfile.path()).expect("should open temp file"); let memory_map = Arc::new(unsafe { MemoryMap::map(&file).expect("should memory_map temp file") }); - let mut input = SortInput::from_memory_map(memory_map); - + let mut input = OpenedInput::SharedMemoryMap { + data: memory_map, + offset: 0, + }; let mut buf = [0u8; 20]; let n = input.read(&mut buf).expect("should read from input"); assert_eq!(n, 20); @@ -295,7 +303,10 @@ mod tests { let file = File::open(tmpfile.path()).expect("should open temp file"); let mmap = Arc::new(unsafe { MemoryMap::map(&file).expect("should mmap temp file") }); - let input = SortInput::from_memory_map(mmap); + let input = OpenedInput::SharedMemoryMap { + data: mmap, + offset: 0, + }; let mut reader: Box = Box::new(input); let mut buf = [0u8; 9]; let n = reader @@ -317,8 +328,14 @@ mod tests { let file = File::open(tmpfile.path()).expect("should open temp file"); let mmap = Arc::new(unsafe { MemoryMap::map(&file).expect("should mmap temp file") }); - let mut input1 = SortInput::from_memory_map(mmap.clone()); - let mut input2 = SortInput::from_memory_map(mmap); + let mut input1 = OpenedInput::SharedMemoryMap { + data: mmap.clone(), + offset: 0, + }; + let mut input2 = OpenedInput::SharedMemoryMap { + data: mmap, + offset: 0, + }; // Both should be able to read independently let mut buf1 = [0u8; 11]; @@ -420,36 +437,17 @@ mod tests { ]; let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); - // Both inputs should be able to read independently - let mut buf1 = [0u8; 11]; - let mut input1 = SortInput { - inner: match &inputs.iter().next().expect("should get first input").inner { - SortInputInner::FileRead { data, offset } => SortInputInner::FileRead { - data: data.clone(), - offset: *offset, - }, - _ => panic!("Expected mmap"), - }, - }; - input1 - .read_exact(&mut buf1) - .expect("should read from first input"); - assert_eq!(&buf1, b"independent"); + assert_eq!(inputs.len(), 2); - let mut buf2 = [0u8; 11]; - let mut input2 = SortInput { - inner: match &inputs.iter().nth(1).expect("should get second input").inner { - SortInputInner::FileRead { data, offset } => SortInputInner::FileRead { - data: data.clone(), - offset: *offset, - }, - _ => panic!("Expected mmap"), - }, - }; - input2 - .read_exact(&mut buf2) - .expect("should read from second input"); - assert_eq!(&buf2, b"independent"); + for input in inputs.iter() { + assert!(matches!( + input.inner, + DeferredInput::Path { + access: InputAccess::SharedMemoryMap, + .. + } + )); + } } #[test] @@ -458,7 +456,7 @@ mod tests { let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); let input = inputs.iter().next().expect("should get first input"); assert_eq!(inputs.len(), 1); - assert!(matches!(input.inner, SortInputInner::Stdin)); + assert!(matches!(input.inner, DeferredInput::Stdin)); } #[test] From d651d9a348a5d6c974516eb3b48c8f55d6ce5740 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Mon, 1 Jun 2026 17:48:52 +0200 Subject: [PATCH 18/19] chore: clippy feedback --- src/uu/sort/src/merge.rs | 5 ++-- src/uu/sort/src/sort_inputs/model.rs | 44 +++++++++++++--------------- 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index b697738e531..9289b541929 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -113,7 +113,7 @@ pub fn merge( ) -> UResult<()> { let output_as_input = load_output_as_input(&output, files)?; - let sort_inputs = SortInputs::from_files_with_output(files, output_as_input)?; + let sort_inputs = SortInputs::from_files_with_output(files, output_as_input); let files = sort_inputs .into_iter() .map(|result| result.map(|input| PlainMergeInput:: { inner: input })); @@ -651,8 +651,7 @@ mod tests { let output_as_input = Arc::new(unsafe { MemoryMap::map(&read_fd).unwrap() }); let sort_inputs = - SortInputs::from_files_with_output(&files, Some((output_canon, output_as_input))) - .unwrap(); + SortInputs::from_files_with_output(&files, Some((output_canon, output_as_input))); assert_eq!(sort_inputs.len(), 7); assert_eq!(sort_inputs.unique_count(), 6); diff --git a/src/uu/sort/src/sort_inputs/model.rs b/src/uu/sort/src/sort_inputs/model.rs index 3916a126853..1ea23289bf1 100644 --- a/src/uu/sort/src/sort_inputs/model.rs +++ b/src/uu/sort/src/sort_inputs/model.rs @@ -16,10 +16,8 @@ use std::{ sync::Arc, }; -use memmap2::Mmap as MemoryMap; -use uucore::error::UResult; - use crate::STDIN_FILE; +use memmap2::Mmap as MemoryMap; /// Deferred representation of a sort input before any file descriptor is opened. #[derive(Debug)] @@ -30,7 +28,6 @@ pub enum DeferredInput { path: PathBuf, access: InputAccess, }, - /// Snapshot of the output file captured before it is opened for writing. OutputSnapshot(Arc), } @@ -52,8 +49,8 @@ pub enum OpenedInput { impl Read for OpenedInput { fn read(&mut self, buf: &mut [u8]) -> io::Result { match self { - OpenedInput::File(file) => file.read(buf), - OpenedInput::SharedMemoryMap { data, offset } => { + Self::File(file) => file.read(buf), + Self::SharedMemoryMap { data, offset } => { let pos = *offset; let available = data.len().saturating_sub(pos); let to_read = buf.len().min(available); @@ -65,7 +62,7 @@ impl Read for OpenedInput { Ok(to_read) } - OpenedInput::Stdin => { + Self::Stdin => { let mut stdin = io::stdin(); stdin.read(buf) } @@ -118,7 +115,7 @@ impl SortInputs { pub fn from_files_with_output( files: &[OsString], output_as_input: Option<(PathBuf, Arc)>, - ) -> UResult { + ) -> Self { let mut inputs = Vec::with_capacity(files.len()); // First pass: count occurrences of each path to identify duplicates @@ -159,7 +156,6 @@ impl SortInputs { PathBuf::from(file), InputAccess::OpenFile, )); - continue; // Duplicate input } else { inputs.push(SortInput::to_args_path( @@ -169,9 +165,9 @@ impl SortInputs { } } - Ok(Self { + Self { inputs: inputs.into_iter().collect(), - }) + } } } @@ -186,7 +182,7 @@ mod tests { // Util method for SortInputs in test impl SortInputs { - pub fn from_files(files: &[OsString]) -> UResult { + pub fn from_files(files: &[OsString]) -> Self { Self::from_files_with_output(files, None) } @@ -353,7 +349,7 @@ mod tests { #[test] fn test_sort_inputs_empty() { - let inputs = SortInputs::from_files(&[]).expect("should build empty sort inputs"); + let inputs = SortInputs::from_files(&[]); assert_eq!(inputs.len(), 0); assert!(inputs.is_empty()); } @@ -368,7 +364,7 @@ mod tests { tmpfile.flush().expect("should flush temp file"); let files = vec![tmpfile.path().as_os_str().to_os_string()]; - let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + let inputs = SortInputs::from_files(&files); assert_eq!(inputs.len(), 1); assert_eq!(inputs.unique_count(), 1); } @@ -394,7 +390,7 @@ mod tests { tmpfile2.path().as_os_str().to_os_string(), tmpfile3.path().as_os_str().to_os_string(), ]; - let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + let inputs = SortInputs::from_files(&files); assert_eq!(inputs.len(), 3); assert_eq!(inputs.unique_count(), 3); } @@ -416,7 +412,7 @@ mod tests { tmpfile1.path().as_os_str().to_os_string(), tmpfile2.path().as_os_str().to_os_string(), ]; - let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + let inputs = SortInputs::from_files(&files); assert_eq!(inputs.len(), 3); // 2 unique: file1 (mmap) and file2 (direct) assert_eq!(inputs.unique_count(), 2); @@ -435,7 +431,7 @@ mod tests { tmpfile.path().as_os_str().to_os_string(), tmpfile.path().as_os_str().to_os_string(), ]; - let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + let inputs = SortInputs::from_files(&files); assert_eq!(inputs.len(), 2); @@ -453,7 +449,7 @@ mod tests { #[test] fn test_sort_inputs_stdin_only() { let files = vec![OsString::from("-")]; - let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + let inputs = SortInputs::from_files(&files); let input = inputs.iter().next().expect("should get first input"); assert_eq!(inputs.len(), 1); assert!(matches!(input.inner, DeferredInput::Stdin)); @@ -463,8 +459,8 @@ mod tests { fn test_sort_inputs_duplicate_stdin_allowed() { // Verify that duplicate stdin is allowed (GNU Coreutils compatible) let files = vec![OsString::from("-"), OsString::from("-")]; - let result = SortInputs::from_files(&files); - assert!(result.is_ok()); + let inputs = SortInputs::from_files(&files); + assert_eq!(inputs.len(), files.len()); } #[test] @@ -480,8 +476,8 @@ mod tests { OsString::from("-"), tmpfile.path().as_os_str().to_os_string(), ]; - let result = SortInputs::from_files(&files); - assert!(result.is_ok()); + let inputs = SortInputs::from_files(&files); + assert_eq!(inputs.len(), files.len()); } #[test] @@ -500,7 +496,7 @@ mod tests { tmpfile2.path().as_os_str().to_os_string(), tmpfile1.path().as_os_str().to_os_string(), ]; - let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + let inputs = SortInputs::from_files(&files); let collected: Vec<_> = inputs.iter().collect(); assert_eq!(collected.len(), 2); } @@ -517,7 +513,7 @@ mod tests { tmpfile.path().as_os_str().to_os_string(), OsString::from("/nonexistent/path/file.txt"), ]; - let inputs = SortInputs::from_files(&files).expect("should build sort inputs"); + let inputs = SortInputs::from_files(&files); let mut iter = inputs.into_iter(); assert!(iter.next().expect("should get first input").is_ok()); // first file opens successfully assert!(iter.next().expect("should get second input").is_err()); // second file fails to open From 6d51823edab848f91e7e34390eb6b8a41adc5df3 Mon Sep 17 00:00:00 2001 From: Your Name <513142+nonontb@users.noreply.github.com> Date: Mon, 1 Jun 2026 17:59:00 +0200 Subject: [PATCH 19/19] Fix: Bad fuzz/Cargo.lock merge --- fuzz/Cargo.lock | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 66cc4dc9069..4ba4c2d6238 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -2059,11 +2059,7 @@ dependencies = [ "itertools", "libc", "memchr", -<<<<<<< HEAD -======= "memmap2", - "nix", ->>>>>>> 7d0a0137f (chore: Update fuz cargo.lock) "rand", "rayon", "rustix",