From 849086e9c5c2dd24ca956e89a255cafa7c494cb2 Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Sat, 31 Jul 2021 20:12:38 +0200 Subject: [PATCH] sort: handle cases where the output file is also an input file In such cases we have to create a temporary copy of the input file to prevent overwriting the input with the output. This only affects merge sort, because it is the only mode where we start writing to the output before having read all inputs. --- src/uu/sort/src/check.rs | 17 ++++++++-- src/uu/sort/src/merge.rs | 59 ++++++++++++++++++++++++++------ src/uu/sort/src/sort.rs | 69 +++++++++++++++++++++++++------------- tests/by-util/test_sort.rs | 10 ++++++ 4 files changed, 119 insertions(+), 36 deletions(-) diff --git a/src/uu/sort/src/check.rs b/src/uu/sort/src/check.rs index de320ef77..350a98e23 100644 --- a/src/uu/sort/src/check.rs +++ b/src/uu/sort/src/check.rs @@ -14,6 +14,7 @@ use crate::{ use itertools::Itertools; use std::{ cmp::Ordering, + ffi::OsStr, io::Read, iter, sync::mpsc::{sync_channel, Receiver, SyncSender}, @@ -25,7 +26,7 @@ use std::{ /// # Returns /// /// The code we should exit with. -pub fn check(path: &str, settings: &GlobalSettings) -> i32 { +pub fn check(path: &OsStr, settings: &GlobalSettings) -> i32 { let max_allowed_cmp = if settings.unique { // If `unique` is enabled, the previous line must compare _less_ to the next one. Ordering::Less @@ -63,7 +64,12 @@ pub fn check(path: &str, settings: &GlobalSettings) -> i32 { ) > max_allowed_cmp { if !settings.check_silent { - eprintln!("sort: {}:{}: disorder: {}", path, line_idx, new_first.line); + eprintln!( + "sort: {}:{}: disorder: {}", + path.to_string_lossy(), + line_idx, + new_first.line + ); } return 1; } @@ -74,7 +80,12 @@ pub fn check(path: &str, settings: &GlobalSettings) -> i32 { line_idx += 1; if compare_by(a, b, settings, chunk.line_data(), chunk.line_data()) > max_allowed_cmp { if !settings.check_silent { - eprintln!("sort: {}:{}: disorder: {}", path, line_idx, b.line); + eprintln!( + "sort: {}:{}: disorder: {}", + path.to_string_lossy(), + line_idx, + b.line + ); } return 1; } diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index b8d69fb14..4a45028f7 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -9,10 +9,11 @@ use std::{ cmp::Ordering, + ffi::OsString, fs::{self, File}, io::{BufWriter, Read, Write}, iter, - path::PathBuf, + path::{Path, PathBuf}, process::{Child, ChildStdin, ChildStdout, Command, Stdio}, rc::Rc, sync::mpsc::{channel, sync_channel, Receiver, Sender, SyncSender}, @@ -25,28 +26,66 @@ use tempfile::TempDir; use crate::{ chunks::{self, Chunk, RecycledChunk}, - compare_by, GlobalSettings, Output, + compare_by, open, GlobalSettings, Output, }; +/// If the output file occurs in the input files as well, copy the contents of the output file +/// and replace its occurrences in the inputs with that copy. +fn replace_output_file_in_input_files( + files: &mut [OsString], + settings: &GlobalSettings, + output: Option<&str>, +) -> Option<(TempDir, usize)> { + let mut copy: Option<(TempDir, PathBuf)> = None; + if let Some(Ok(output_path)) = output.map(|path| Path::new(path).canonicalize()) { + for file in files { + if let Ok(file_path) = Path::new(file).canonicalize() { + if file_path == output_path { + if let Some((_dir, copy)) = © { + *file = copy.clone().into_os_string(); + } else { + let tmp_dir = tempfile::Builder::new() + .prefix("uutils_sort") + .tempdir_in(&settings.tmp_dir) + .unwrap(); + let copy_path = tmp_dir.path().join("0"); + std::fs::copy(file_path, ©_path).unwrap(); + *file = copy_path.clone().into_os_string(); + copy = Some((tmp_dir, copy_path)) + } + } + } + } + } + // if we created a TempDir its size must be one. + copy.map(|(dir, _copy)| (dir, 1)) +} + /// Merge pre-sorted `Box`s. /// /// If `settings.merge_batch_size` is greater than the length of `files`, intermediate files will be used. /// If `settings.compress_prog` is `Some`, intermediate files will be compressed with it. -pub fn merge>>( - files: Files, - settings: &GlobalSettings, -) -> FileMerger { +pub fn merge<'a>( + files: &mut [OsString], + settings: &'a GlobalSettings, + output: Option<&str>, +) -> FileMerger<'a> { + let tmp_dir = replace_output_file_in_input_files(files, settings, output); if settings.compress_prog.is_none() { merge_with_file_limit::<_, _, WriteablePlainTmpFile>( - files.map(|file| PlainMergeInput { inner: file }), + files + .iter() + .map(|file| PlainMergeInput { inner: open(file) }), settings, - None, + tmp_dir, ) } else { merge_with_file_limit::<_, _, WriteableCompressedTmpFile>( - files.map(|file| PlainMergeInput { inner: file }), + files + .iter() + .map(|file| PlainMergeInput { inner: open(file) }), settings, - None, + tmp_dir, ) } } diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index aa4d483c0..1f674e549 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -33,8 +33,8 @@ use rand::{thread_rng, Rng}; use rayon::prelude::*; use std::cmp::Ordering; use std::env; -use std::ffi::OsStr; -use std::fs::File; +use std::ffi::{OsStr, OsString}; +use std::fs::{File, OpenOptions}; use std::hash::{Hash, Hasher}; use std::io::{stdin, stdout, BufRead, BufReader, Read, Write}; use std::ops::Range; @@ -146,26 +146,46 @@ impl SortMode { } pub struct Output { - file: Option, + file: Option<(String, File)>, } impl Output { fn new(name: Option<&str>) -> Self { Self { file: name.map(|name| { - File::create(name).unwrap_or_else(|e| { - crash!(2, "open failed: {}: {}", name, strip_errno(&e.to_string())) - }) + // This is different from `File::create()` because we don't truncate the output yet. + // This allows using the output file as an input file. + ( + name.to_owned(), + OpenOptions::new() + .write(true) + .create(true) + .open(name) + .unwrap_or_else(|e| { + crash!(2, "open failed: {}: {}", name, strip_errno(&e.to_string())) + }), + ) }), } } fn into_write(self) -> Box { match self.file { - Some(file) => Box::new(file), + Some((_name, file)) => { + // truncate the file + file.set_len(0).unwrap(); + Box::new(file) + } None => Box::new(stdout()), } } + + fn as_output_name(&self) -> Option<&str> { + match &self.file { + Some((name, _file)) => Some(name), + None => None, + } + } } #[derive(Clone)] @@ -956,29 +976,28 @@ pub fn uumain(args: impl uucore::Args) -> i32 { settings.debug = matches.is_present(options::DEBUG); // check whether user specified a zero terminated list of files for input, otherwise read files from args - let mut files: Vec = if matches.is_present(options::FILES0_FROM) { - let files0_from: Vec = matches - .values_of(options::FILES0_FROM) - .map(|v| v.map(ToString::to_string).collect()) + let mut files: Vec = if matches.is_present(options::FILES0_FROM) { + let files0_from: Vec = matches + .values_of_os(options::FILES0_FROM) + .map(|v| v.map(ToOwned::to_owned).collect()) .unwrap_or_default(); let mut files = Vec::new(); for path in &files0_from { - let reader = open(path.as_str()); + let reader = open(&path); let buf_reader = BufReader::new(reader); for line in buf_reader.split(b'\0').flatten() { - files.push( + files.push(OsString::from( std::str::from_utf8(&line) - .expect("Could not parse string from zero terminated input.") - .to_string(), - ); + .expect("Could not parse string from zero terminated input."), + )); } } files } else { matches - .values_of(options::FILES) - .map(|v| v.map(ToString::to_string).collect()) + .values_of_os(options::FILES) + .map(|v| v.map(ToOwned::to_owned).collect()) .unwrap_or_default() }; @@ -1066,9 +1085,13 @@ pub fn uumain(args: impl uucore::Args) -> i32 { if files.is_empty() { /* if no file, default to stdin */ - files.push("-".to_owned()); + files.push("-".to_string().into()); } else if settings.check && files.len() != 1 { - crash!(2, "extra operand `{}' not allowed with -c", files[1]) + crash!( + 2, + "extra operand `{}' not allowed with -c", + files[1].to_string_lossy() + ) } if let Some(arg) = matches.args.get(options::SEPARATOR) { @@ -1122,7 +1145,7 @@ pub fn uumain(args: impl uucore::Args) -> i32 { settings.init_precomputed(); - exec(&files, &settings, output) + exec(&mut files, &settings, output) } pub fn uu_app() -> App<'static, 'static> { @@ -1345,9 +1368,9 @@ pub fn uu_app() -> App<'static, 'static> { ) } -fn exec(files: &[String], settings: &GlobalSettings, output: Output) -> i32 { +fn exec(files: &mut [OsString], settings: &GlobalSettings, output: Output) -> i32 { if settings.merge { - let mut file_merger = merge::merge(files.iter().map(open), settings); + let mut file_merger = merge::merge(files, settings, output.as_output_name()); file_merger.write_all(settings, output); } else if settings.check { if files.len() > 1 { diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index b9cdb2b80..4c4a7a697 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -1020,3 +1020,13 @@ fn test_separator_null() { .succeeds() .stdout_only("a\0z\0z\nz\0b\0a\nz\0a\0b\n"); } + +#[test] +fn test_output_is_input() { + let input = "a\nb\nc\n"; + let (at, mut cmd) = at_and_ucmd!(); + at.touch("file"); + at.append("file", input); + cmd.args(&["-m", "-o", "file", "file"]).succeeds(); + assert_eq!(at.read("file"), input); +}