1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-29 12:07:46 +00:00

sort: handle cases where the output file is also an input file

In such cases we have to create a temporary copy of the input file to prevent
overwriting the input with the output. This only affects merge sort, because it
is the only mode where we start writing to the output before having read all inputs.
This commit is contained in:
Michael Debertol 2021-07-31 20:12:38 +02:00
parent 103a9d52ff
commit 849086e9c5
4 changed files with 119 additions and 36 deletions

View file

@ -14,6 +14,7 @@ use crate::{
use itertools::Itertools;
use std::{
cmp::Ordering,
ffi::OsStr,
io::Read,
iter,
sync::mpsc::{sync_channel, Receiver, SyncSender},
@ -25,7 +26,7 @@ use std::{
/// # Returns
///
/// The code we should exit with.
pub fn check(path: &str, settings: &GlobalSettings) -> i32 {
pub fn check(path: &OsStr, settings: &GlobalSettings) -> i32 {
let max_allowed_cmp = if settings.unique {
// If `unique` is enabled, the previous line must compare _less_ to the next one.
Ordering::Less
@ -63,7 +64,12 @@ pub fn check(path: &str, settings: &GlobalSettings) -> i32 {
) > max_allowed_cmp
{
if !settings.check_silent {
eprintln!("sort: {}:{}: disorder: {}", path, line_idx, new_first.line);
eprintln!(
"sort: {}:{}: disorder: {}",
path.to_string_lossy(),
line_idx,
new_first.line
);
}
return 1;
}
@ -74,7 +80,12 @@ pub fn check(path: &str, settings: &GlobalSettings) -> i32 {
line_idx += 1;
if compare_by(a, b, settings, chunk.line_data(), chunk.line_data()) > max_allowed_cmp {
if !settings.check_silent {
eprintln!("sort: {}:{}: disorder: {}", path, line_idx, b.line);
eprintln!(
"sort: {}:{}: disorder: {}",
path.to_string_lossy(),
line_idx,
b.line
);
}
return 1;
}

View file

@ -9,10 +9,11 @@
use std::{
cmp::Ordering,
ffi::OsString,
fs::{self, File},
io::{BufWriter, Read, Write},
iter,
path::PathBuf,
path::{Path, PathBuf},
process::{Child, ChildStdin, ChildStdout, Command, Stdio},
rc::Rc,
sync::mpsc::{channel, sync_channel, Receiver, Sender, SyncSender},
@ -25,28 +26,66 @@ use tempfile::TempDir;
use crate::{
chunks::{self, Chunk, RecycledChunk},
compare_by, GlobalSettings, Output,
compare_by, open, GlobalSettings, Output,
};
/// If the output file occurs in the input files as well, copy the contents of the output file
/// and replace its occurrences in the inputs with that copy.
fn replace_output_file_in_input_files(
files: &mut [OsString],
settings: &GlobalSettings,
output: Option<&str>,
) -> Option<(TempDir, usize)> {
let mut copy: Option<(TempDir, PathBuf)> = None;
if let Some(Ok(output_path)) = output.map(|path| Path::new(path).canonicalize()) {
for file in files {
if let Ok(file_path) = Path::new(file).canonicalize() {
if file_path == output_path {
if let Some((_dir, copy)) = &copy {
*file = copy.clone().into_os_string();
} else {
let tmp_dir = tempfile::Builder::new()
.prefix("uutils_sort")
.tempdir_in(&settings.tmp_dir)
.unwrap();
let copy_path = tmp_dir.path().join("0");
std::fs::copy(file_path, &copy_path).unwrap();
*file = copy_path.clone().into_os_string();
copy = Some((tmp_dir, copy_path))
}
}
}
}
}
// if we created a TempDir its size must be one.
copy.map(|(dir, _copy)| (dir, 1))
}
/// Merge pre-sorted `Box<dyn Read>`s.
///
/// If `settings.merge_batch_size` is greater than the length of `files`, intermediate files will be used.
/// If `settings.compress_prog` is `Some`, intermediate files will be compressed with it.
pub fn merge<Files: ExactSizeIterator<Item = Box<dyn Read + Send>>>(
files: Files,
settings: &GlobalSettings,
) -> FileMerger {
pub fn merge<'a>(
files: &mut [OsString],
settings: &'a GlobalSettings,
output: Option<&str>,
) -> FileMerger<'a> {
let tmp_dir = replace_output_file_in_input_files(files, settings, output);
if settings.compress_prog.is_none() {
merge_with_file_limit::<_, _, WriteablePlainTmpFile>(
files.map(|file| PlainMergeInput { inner: file }),
files
.iter()
.map(|file| PlainMergeInput { inner: open(file) }),
settings,
None,
tmp_dir,
)
} else {
merge_with_file_limit::<_, _, WriteableCompressedTmpFile>(
files.map(|file| PlainMergeInput { inner: file }),
files
.iter()
.map(|file| PlainMergeInput { inner: open(file) }),
settings,
None,
tmp_dir,
)
}
}

View file

@ -33,8 +33,8 @@ use rand::{thread_rng, Rng};
use rayon::prelude::*;
use std::cmp::Ordering;
use std::env;
use std::ffi::OsStr;
use std::fs::File;
use std::ffi::{OsStr, OsString};
use std::fs::{File, OpenOptions};
use std::hash::{Hash, Hasher};
use std::io::{stdin, stdout, BufRead, BufReader, Read, Write};
use std::ops::Range;
@ -146,26 +146,46 @@ impl SortMode {
}
pub struct Output {
file: Option<File>,
file: Option<(String, File)>,
}
impl Output {
fn new(name: Option<&str>) -> Self {
Self {
file: name.map(|name| {
File::create(name).unwrap_or_else(|e| {
// This is different from `File::create()` because we don't truncate the output yet.
// This allows using the output file as an input file.
(
name.to_owned(),
OpenOptions::new()
.write(true)
.create(true)
.open(name)
.unwrap_or_else(|e| {
crash!(2, "open failed: {}: {}", name, strip_errno(&e.to_string()))
})
}),
)
}),
}
}
fn into_write(self) -> Box<dyn Write> {
match self.file {
Some(file) => Box::new(file),
Some((_name, file)) => {
// truncate the file
file.set_len(0).unwrap();
Box::new(file)
}
None => Box::new(stdout()),
}
}
fn as_output_name(&self) -> Option<&str> {
match &self.file {
Some((name, _file)) => Some(name),
None => None,
}
}
}
#[derive(Clone)]
@ -956,29 +976,28 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
settings.debug = matches.is_present(options::DEBUG);
// check whether user specified a zero terminated list of files for input, otherwise read files from args
let mut files: Vec<String> = if matches.is_present(options::FILES0_FROM) {
let files0_from: Vec<String> = matches
.values_of(options::FILES0_FROM)
.map(|v| v.map(ToString::to_string).collect())
let mut files: Vec<OsString> = if matches.is_present(options::FILES0_FROM) {
let files0_from: Vec<OsString> = matches
.values_of_os(options::FILES0_FROM)
.map(|v| v.map(ToOwned::to_owned).collect())
.unwrap_or_default();
let mut files = Vec::new();
for path in &files0_from {
let reader = open(path.as_str());
let reader = open(&path);
let buf_reader = BufReader::new(reader);
for line in buf_reader.split(b'\0').flatten() {
files.push(
files.push(OsString::from(
std::str::from_utf8(&line)
.expect("Could not parse string from zero terminated input.")
.to_string(),
);
.expect("Could not parse string from zero terminated input."),
));
}
}
files
} else {
matches
.values_of(options::FILES)
.map(|v| v.map(ToString::to_string).collect())
.values_of_os(options::FILES)
.map(|v| v.map(ToOwned::to_owned).collect())
.unwrap_or_default()
};
@ -1066,9 +1085,13 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
if files.is_empty() {
/* if no file, default to stdin */
files.push("-".to_owned());
files.push("-".to_string().into());
} else if settings.check && files.len() != 1 {
crash!(2, "extra operand `{}' not allowed with -c", files[1])
crash!(
2,
"extra operand `{}' not allowed with -c",
files[1].to_string_lossy()
)
}
if let Some(arg) = matches.args.get(options::SEPARATOR) {
@ -1122,7 +1145,7 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
settings.init_precomputed();
exec(&files, &settings, output)
exec(&mut files, &settings, output)
}
pub fn uu_app() -> App<'static, 'static> {
@ -1345,9 +1368,9 @@ pub fn uu_app() -> App<'static, 'static> {
)
}
fn exec(files: &[String], settings: &GlobalSettings, output: Output) -> i32 {
fn exec(files: &mut [OsString], settings: &GlobalSettings, output: Output) -> i32 {
if settings.merge {
let mut file_merger = merge::merge(files.iter().map(open), settings);
let mut file_merger = merge::merge(files, settings, output.as_output_name());
file_merger.write_all(settings, output);
} else if settings.check {
if files.len() > 1 {

View file

@ -1020,3 +1020,13 @@ fn test_separator_null() {
.succeeds()
.stdout_only("a\0z\0z\nz\0b\0a\nz\0a\0b\n");
}
#[test]
fn test_output_is_input() {
let input = "a\nb\nc\n";
let (at, mut cmd) = at_and_ucmd!();
at.touch("file");
at.append("file", input);
cmd.args(&["-m", "-o", "file", "file"]).succeeds();
assert_eq!(at.read("file"), input);
}