From 849086e9c5c2dd24ca956e89a255cafa7c494cb2 Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Sat, 31 Jul 2021 20:12:38 +0200 Subject: [PATCH 1/7] sort: handle cases where the output file is also an input file In such cases we have to create a temporary copy of the input file to prevent overwriting the input with the output. This only affects merge sort, because it is the only mode where we start writing to the output before having read all inputs. --- src/uu/sort/src/check.rs | 17 ++++++++-- src/uu/sort/src/merge.rs | 59 ++++++++++++++++++++++++++------ src/uu/sort/src/sort.rs | 69 +++++++++++++++++++++++++------------- tests/by-util/test_sort.rs | 10 ++++++ 4 files changed, 119 insertions(+), 36 deletions(-) diff --git a/src/uu/sort/src/check.rs b/src/uu/sort/src/check.rs index de320ef77..350a98e23 100644 --- a/src/uu/sort/src/check.rs +++ b/src/uu/sort/src/check.rs @@ -14,6 +14,7 @@ use crate::{ use itertools::Itertools; use std::{ cmp::Ordering, + ffi::OsStr, io::Read, iter, sync::mpsc::{sync_channel, Receiver, SyncSender}, @@ -25,7 +26,7 @@ use std::{ /// # Returns /// /// The code we should exit with. -pub fn check(path: &str, settings: &GlobalSettings) -> i32 { +pub fn check(path: &OsStr, settings: &GlobalSettings) -> i32 { let max_allowed_cmp = if settings.unique { // If `unique` is enabled, the previous line must compare _less_ to the next one. Ordering::Less @@ -63,7 +64,12 @@ pub fn check(path: &str, settings: &GlobalSettings) -> i32 { ) > max_allowed_cmp { if !settings.check_silent { - eprintln!("sort: {}:{}: disorder: {}", path, line_idx, new_first.line); + eprintln!( + "sort: {}:{}: disorder: {}", + path.to_string_lossy(), + line_idx, + new_first.line + ); } return 1; } @@ -74,7 +80,12 @@ pub fn check(path: &str, settings: &GlobalSettings) -> i32 { line_idx += 1; if compare_by(a, b, settings, chunk.line_data(), chunk.line_data()) > max_allowed_cmp { if !settings.check_silent { - eprintln!("sort: {}:{}: disorder: {}", path, line_idx, b.line); + eprintln!( + "sort: {}:{}: disorder: {}", + path.to_string_lossy(), + line_idx, + b.line + ); } return 1; } diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index b8d69fb14..4a45028f7 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -9,10 +9,11 @@ use std::{ cmp::Ordering, + ffi::OsString, fs::{self, File}, io::{BufWriter, Read, Write}, iter, - path::PathBuf, + path::{Path, PathBuf}, process::{Child, ChildStdin, ChildStdout, Command, Stdio}, rc::Rc, sync::mpsc::{channel, sync_channel, Receiver, Sender, SyncSender}, @@ -25,28 +26,66 @@ use tempfile::TempDir; use crate::{ chunks::{self, Chunk, RecycledChunk}, - compare_by, GlobalSettings, Output, + compare_by, open, GlobalSettings, Output, }; +/// If the output file occurs in the input files as well, copy the contents of the output file +/// and replace its occurrences in the inputs with that copy. +fn replace_output_file_in_input_files( + files: &mut [OsString], + settings: &GlobalSettings, + output: Option<&str>, +) -> Option<(TempDir, usize)> { + let mut copy: Option<(TempDir, PathBuf)> = None; + if let Some(Ok(output_path)) = output.map(|path| Path::new(path).canonicalize()) { + for file in files { + if let Ok(file_path) = Path::new(file).canonicalize() { + if file_path == output_path { + if let Some((_dir, copy)) = © { + *file = copy.clone().into_os_string(); + } else { + let tmp_dir = tempfile::Builder::new() + .prefix("uutils_sort") + .tempdir_in(&settings.tmp_dir) + .unwrap(); + let copy_path = tmp_dir.path().join("0"); + std::fs::copy(file_path, ©_path).unwrap(); + *file = copy_path.clone().into_os_string(); + copy = Some((tmp_dir, copy_path)) + } + } + } + } + } + // if we created a TempDir its size must be one. + copy.map(|(dir, _copy)| (dir, 1)) +} + /// Merge pre-sorted `Box`s. /// /// If `settings.merge_batch_size` is greater than the length of `files`, intermediate files will be used. /// If `settings.compress_prog` is `Some`, intermediate files will be compressed with it. -pub fn merge>>( - files: Files, - settings: &GlobalSettings, -) -> FileMerger { +pub fn merge<'a>( + files: &mut [OsString], + settings: &'a GlobalSettings, + output: Option<&str>, +) -> FileMerger<'a> { + let tmp_dir = replace_output_file_in_input_files(files, settings, output); if settings.compress_prog.is_none() { merge_with_file_limit::<_, _, WriteablePlainTmpFile>( - files.map(|file| PlainMergeInput { inner: file }), + files + .iter() + .map(|file| PlainMergeInput { inner: open(file) }), settings, - None, + tmp_dir, ) } else { merge_with_file_limit::<_, _, WriteableCompressedTmpFile>( - files.map(|file| PlainMergeInput { inner: file }), + files + .iter() + .map(|file| PlainMergeInput { inner: open(file) }), settings, - None, + tmp_dir, ) } } diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index aa4d483c0..1f674e549 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -33,8 +33,8 @@ use rand::{thread_rng, Rng}; use rayon::prelude::*; use std::cmp::Ordering; use std::env; -use std::ffi::OsStr; -use std::fs::File; +use std::ffi::{OsStr, OsString}; +use std::fs::{File, OpenOptions}; use std::hash::{Hash, Hasher}; use std::io::{stdin, stdout, BufRead, BufReader, Read, Write}; use std::ops::Range; @@ -146,26 +146,46 @@ impl SortMode { } pub struct Output { - file: Option, + file: Option<(String, File)>, } impl Output { fn new(name: Option<&str>) -> Self { Self { file: name.map(|name| { - File::create(name).unwrap_or_else(|e| { - crash!(2, "open failed: {}: {}", name, strip_errno(&e.to_string())) - }) + // This is different from `File::create()` because we don't truncate the output yet. + // This allows using the output file as an input file. + ( + name.to_owned(), + OpenOptions::new() + .write(true) + .create(true) + .open(name) + .unwrap_or_else(|e| { + crash!(2, "open failed: {}: {}", name, strip_errno(&e.to_string())) + }), + ) }), } } fn into_write(self) -> Box { match self.file { - Some(file) => Box::new(file), + Some((_name, file)) => { + // truncate the file + file.set_len(0).unwrap(); + Box::new(file) + } None => Box::new(stdout()), } } + + fn as_output_name(&self) -> Option<&str> { + match &self.file { + Some((name, _file)) => Some(name), + None => None, + } + } } #[derive(Clone)] @@ -956,29 +976,28 @@ pub fn uumain(args: impl uucore::Args) -> i32 { settings.debug = matches.is_present(options::DEBUG); // check whether user specified a zero terminated list of files for input, otherwise read files from args - let mut files: Vec = if matches.is_present(options::FILES0_FROM) { - let files0_from: Vec = matches - .values_of(options::FILES0_FROM) - .map(|v| v.map(ToString::to_string).collect()) + let mut files: Vec = if matches.is_present(options::FILES0_FROM) { + let files0_from: Vec = matches + .values_of_os(options::FILES0_FROM) + .map(|v| v.map(ToOwned::to_owned).collect()) .unwrap_or_default(); let mut files = Vec::new(); for path in &files0_from { - let reader = open(path.as_str()); + let reader = open(&path); let buf_reader = BufReader::new(reader); for line in buf_reader.split(b'\0').flatten() { - files.push( + files.push(OsString::from( std::str::from_utf8(&line) - .expect("Could not parse string from zero terminated input.") - .to_string(), - ); + .expect("Could not parse string from zero terminated input."), + )); } } files } else { matches - .values_of(options::FILES) - .map(|v| v.map(ToString::to_string).collect()) + .values_of_os(options::FILES) + .map(|v| v.map(ToOwned::to_owned).collect()) .unwrap_or_default() }; @@ -1066,9 +1085,13 @@ pub fn uumain(args: impl uucore::Args) -> i32 { if files.is_empty() { /* if no file, default to stdin */ - files.push("-".to_owned()); + files.push("-".to_string().into()); } else if settings.check && files.len() != 1 { - crash!(2, "extra operand `{}' not allowed with -c", files[1]) + crash!( + 2, + "extra operand `{}' not allowed with -c", + files[1].to_string_lossy() + ) } if let Some(arg) = matches.args.get(options::SEPARATOR) { @@ -1122,7 +1145,7 @@ pub fn uumain(args: impl uucore::Args) -> i32 { settings.init_precomputed(); - exec(&files, &settings, output) + exec(&mut files, &settings, output) } pub fn uu_app() -> App<'static, 'static> { @@ -1345,9 +1368,9 @@ pub fn uu_app() -> App<'static, 'static> { ) } -fn exec(files: &[String], settings: &GlobalSettings, output: Output) -> i32 { +fn exec(files: &mut [OsString], settings: &GlobalSettings, output: Output) -> i32 { if settings.merge { - let mut file_merger = merge::merge(files.iter().map(open), settings); + let mut file_merger = merge::merge(files, settings, output.as_output_name()); file_merger.write_all(settings, output); } else if settings.check { if files.len() > 1 { diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index b9cdb2b80..4c4a7a697 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -1020,3 +1020,13 @@ fn test_separator_null() { .succeeds() .stdout_only("a\0z\0z\nz\0b\0a\nz\0a\0b\n"); } + +#[test] +fn test_output_is_input() { + let input = "a\nb\nc\n"; + let (at, mut cmd) = at_and_ucmd!(); + at.touch("file"); + at.append("file", input); + cmd.args(&["-m", "-o", "file", "file"]).succeeds(); + assert_eq!(at.read("file"), input); +} From f29239beecacadd085ef2f8d4e8bb96a47f20466 Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Sat, 31 Jul 2021 16:15:22 +0200 Subject: [PATCH 2/7] sort: buffer writes to the output This fixes a regression from a33b6d87b578990893257c2b063f8579a3f960c4 --- src/uu/sort/src/sort.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 1f674e549..13c2c3cfa 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -36,7 +36,7 @@ use std::env; use std::ffi::{OsStr, OsString}; use std::fs::{File, OpenOptions}; use std::hash::{Hash, Hasher}; -use std::io::{stdin, stdout, BufRead, BufReader, Read, Write}; +use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Write}; use std::ops::Range; use std::path::Path; use std::path::PathBuf; @@ -169,15 +169,15 @@ impl Output { } } - fn into_write(self) -> Box { - match self.file { + fn into_write(self) -> BufWriter> { + BufWriter::new(match self.file { Some((_name, file)) => { // truncate the file file.set_len(0).unwrap(); Box::new(file) } None => Box::new(stdout()), - } + }) } fn as_output_name(&self) -> Option<&str> { From 5bf4536bdde00f1268e3172d6aad3ef86c56f8e2 Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Sat, 31 Jul 2021 19:40:38 +0200 Subject: [PATCH 3/7] sort: ignore failure to truncate the output file --- src/uu/sort/src/sort.rs | 2 +- tests/by-util/test_sort.rs | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 13c2c3cfa..7c855ab19 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -173,7 +173,7 @@ impl Output { BufWriter::new(match self.file { Some((_name, file)) => { // truncate the file - file.set_len(0).unwrap(); + let _ = file.set_len(0); Box::new(file) } None => Box::new(stdout()), diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index 4c4a7a697..36bed4b94 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -1030,3 +1030,12 @@ fn test_output_is_input() { cmd.args(&["-m", "-o", "file", "file"]).succeeds(); assert_eq!(at.read("file"), input); } + +#[test] +#[cfg(unix)] +fn test_output_device() { + new_ucmd!() + .args(&["-o", "/dev/null"]) + .pipe_in("input") + .succeeds(); +} From 418f5b7692f3e487c875a4b6252e04e8b00a3130 Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Sat, 31 Jul 2021 20:19:11 +0200 Subject: [PATCH 4/7] sort: handle empty merge inputs --- src/uu/sort/src/merge.rs | 14 ++++++++------ tests/by-util/test_sort.rs | 9 +++++++++ tests/fixtures/sort/empty.txt | 0 3 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 tests/fixtures/sort/empty.txt diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index 4a45028f7..a5ac9411b 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -194,12 +194,14 @@ fn merge_without_limit>( let mut mergeable_files = vec![]; for (file_number, receiver) in loaded_receivers.into_iter().enumerate() { - mergeable_files.push(MergeableFile { - current_chunk: Rc::new(receiver.recv().unwrap()), - file_number, - line_idx: 0, - receiver, - }) + if let Ok(chunk) = receiver.recv() { + mergeable_files.push(MergeableFile { + current_chunk: Rc::new(chunk), + file_number, + line_idx: 0, + receiver, + }) + } } FileMerger { diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index 36bed4b94..8e9861a39 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -1039,3 +1039,12 @@ fn test_output_device() { .pipe_in("input") .succeeds(); } + +#[test] +fn test_merge_empty_input() { + new_ucmd!() + .args(&["-m", "empty.txt"]) + .succeeds() + .no_stderr() + .no_stdout(); +} diff --git a/tests/fixtures/sort/empty.txt b/tests/fixtures/sort/empty.txt new file mode 100644 index 000000000..e69de29bb From 3d23ace9b8c02d0e06311db68ecec434c9ed32ac Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Sun, 1 Aug 2021 17:12:06 +0200 Subject: [PATCH 5/7] sort: remove redundant comment --- src/uu/sort/src/ext_sort.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/uu/sort/src/ext_sort.rs b/src/uu/sort/src/ext_sort.rs index ea53900e2..816bf1e1d 100644 --- a/src/uu/sort/src/ext_sort.rs +++ b/src/uu/sort/src/ext_sort.rs @@ -89,8 +89,6 @@ fn reader_writer>, Tmp: WriteableTmpFile files, &settings.tmp_dir, separator, - // Heuristically chosen: Dividing by 10 seems to keep our memory usage roughly - // around settings.buffer_size as a whole. buffer_size, settings, receiver, From af47c66a00c0912e58d346cd3a26896a8db075c1 Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Sun, 1 Aug 2021 17:12:35 +0200 Subject: [PATCH 6/7] sort: improve tests --- src/uu/sort/src/check.rs | 8 +++++++- tests/by-util/test_sort.rs | 27 ++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/uu/sort/src/check.rs b/src/uu/sort/src/check.rs index 350a98e23..d82565c3d 100644 --- a/src/uu/sort/src/check.rs +++ b/src/uu/sort/src/check.rs @@ -42,7 +42,13 @@ pub fn check(path: &OsStr, settings: &GlobalSettings) -> i32 { move || reader(file, recycled_receiver, loaded_sender, &settings) }); for _ in 0..2 { - let _ = recycled_sender.send(RecycledChunk::new(100 * 1024)); + let _ = recycled_sender.send(RecycledChunk::new(if settings.buffer_size < 100 * 1024 { + // when the buffer size is smaller than 100KiB we choose it instead of the default. + // this improves testability. + settings.buffer_size + } else { + 100 * 1024 + })); } let mut prev_chunk: Option = None; diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index 838eb4f98..cac9a5a5d 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -771,6 +771,7 @@ fn test_check() { new_ucmd!() .arg(diagnose_arg) .arg("check_fail.txt") + .arg("--buffer-size=10b") .fails() .stderr_only("sort: check_fail.txt:6: disorder: 5\n"); @@ -892,6 +893,29 @@ fn test_compress() { .stdout_only_fixture("ext_sort.expected"); } +#[test] +#[cfg(target_os = "linux")] +fn test_compress_merge() { + new_ucmd!() + .args(&[ + "--compress-program", + "gzip", + "-S", + "10", + "--batch-size=2", + "-m", + "--unique", + "merge_ints_interleaved_1.txt", + "merge_ints_interleaved_2.txt", + "merge_ints_interleaved_3.txt", + "merge_ints_interleaved_3.txt", + "merge_ints_interleaved_2.txt", + "merge_ints_interleaved_1.txt", + ]) + .succeeds() + .stdout_only_fixture("merge_ints_interleaved.expected"); +} + #[test] fn test_compress_fail() { TestScenario::new(util_name!()) @@ -1027,7 +1051,8 @@ fn test_output_is_input() { let (at, mut cmd) = at_and_ucmd!(); at.touch("file"); at.append("file", input); - cmd.args(&["-m", "-o", "file", "file"]).succeeds(); + cmd.args(&["-m", "-u", "-o", "file", "file", "file", "file"]) + .succeeds(); assert_eq!(at.read("file"), input); } From 450a487d763b299856a1d8ff4dfb2a753c6ec711 Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Sun, 1 Aug 2021 18:02:52 +0200 Subject: [PATCH 7/7] sort: ignore broken pipes in a test Since sort exits early due to the nonexistent file, it might no longer be around when we try to send it the input. This is "by design" and can be ignored. --- tests/by-util/test_sort.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index cac9a5a5d..1f21722ed 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -1000,6 +1000,7 @@ fn test_verifies_out_file() { new_ucmd!() .args(&["-o", "nonexistent_dir/nonexistent_file"]) .pipe_in(input) + .ignore_stdin_write_error() .fails() .status_code(2) .stderr_only(