1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-30 12:37:49 +00:00

Merge branch 'master' of github.com:uutils/coreutils into refactoring_parse_size

This commit is contained in:
Jan Scheer 2021-06-06 22:54:02 +02:00
commit 12de58aec0
20 changed files with 519 additions and 148 deletions

42
Cargo.lock generated
View file

@ -44,13 +44,16 @@ dependencies = [
]
[[package]]
name = "arrayvec"
version = "0.4.12"
name = "arrayref"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd9fd44efafa8690358b7408d253adf110036b88f55672a933f01d616ad9b1b9"
dependencies = [
"nodrop",
]
checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544"
[[package]]
name = "arrayvec"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b"
[[package]]
name = "atty"
@ -100,11 +103,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
[[package]]
name = "blake2-rfc"
version = "0.2.18"
name = "blake2b_simd"
version = "0.5.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400"
checksum = "afa748e348ad3be8263be728124b24a24f268266f6f5d58af9d75f6a40b5c587"
dependencies = [
"arrayref",
"arrayvec",
"constant_time_eq",
]
@ -700,9 +704,9 @@ checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3"
[[package]]
name = "heck"
version = "0.3.2"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac"
checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
dependencies = [
"unicode-segmentation",
]
@ -1383,12 +1387,9 @@ dependencies = [
[[package]]
name = "regex-automata"
version = "0.1.9"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
dependencies = [
"byteorder",
]
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
[[package]]
name = "regex-syntax"
@ -1501,9 +1502,9 @@ dependencies = [
[[package]]
name = "signal-hook-registry"
version = "1.3.0"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16f1d0fef1604ba8f7a073c7e701f213e056707210e9020af4528e0101ce11a6"
checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0"
dependencies = [
"libc",
]
@ -1904,6 +1905,7 @@ dependencies = [
name = "uu_dircolors"
version = "0.0.6"
dependencies = [
"clap",
"glob 0.3.0",
"uucore",
"uucore_procs",
@ -2028,7 +2030,7 @@ dependencies = [
name = "uu_hashsum"
version = "0.0.6"
dependencies = [
"blake2-rfc",
"blake2b_simd",
"clap",
"digest",
"hex",
@ -2215,6 +2217,8 @@ dependencies = [
"nix 0.13.1",
"redox_syscall 0.1.57",
"redox_termios",
"unicode-segmentation",
"unicode-width",
"uucore",
"uucore_procs",
]

View file

@ -342,22 +342,22 @@ To contribute to uutils, please see [CONTRIBUTING](CONTRIBUTING.md).
| Done | Semi-Done | To Do |
|-----------|-----------|--------|
| arch | cp | chcon |
| base32 | expr | csplit |
| base64 | install | dd |
| basename | ls | df |
| cat | more | numfmt |
| chgrp | od (`--strings` and 128-bit data types missing) | runcon |
| chmod | printf | stty |
| chown | sort | |
| chroot | split | |
| cksum | tail | |
| comm | test | |
| csplit | date | |
| cut | join | |
| dircolors | df | |
| base32 | date | dd |
| base64 | df | runcon |
| basename | expr | stty |
| cat | install | |
| chgrp | join | |
| chmod | ls | |
| chown | more | |
| chroot | numfmt | |
| cksum | od (`--strings` and 128-bit data types missing) | |
| comm | pr | |
| csplit | printf | |
| cut | sort | |
| dircolors | split | |
| dirname | tac | |
| du | pr | |
| echo | | |
| du | tail | |
| echo | test | |
| env | | |
| expand | | |
| factor | | |
@ -374,12 +374,12 @@ To contribute to uutils, please see [CONTRIBUTING](CONTRIBUTING.md).
| link | | |
| ln | | |
| logname | | |
| ~~md5sum~~ (replaced by [hashsum](https://github.com/uutils/coreutils/blob/master/src/uu/hashsum/src/hashsum.rs)) | |
| ~~sha1sum~~ (replaced by [hashsum](https://github.com/uutils/coreutils/blob/master/src/uu/hashsum/src/hashsum.rs)) | |
| ~~sha224sum~~ (replaced by [hashsum](https://github.com/uutils/coreutils/blob/master/src/uu/hashsum/src/hashsum.rs)) | |
| ~~sha256sum~~ (replaced by [hashsum](https://github.com/uutils/coreutils/blob/master/src/uu/hashsum/src/hashsum.rs)) | |
| ~~sha384sum~~ (replaced by [hashsum](https://github.com/uutils/coreutils/blob/master/src/uu/hashsum/src/hashsum.rs)) | |
| ~~sha512sum~~ (replaced by [hashsum](https://github.com/uutils/coreutils/blob/master/src/uu/hashsum/src/hashsum.rs)) | |
| ~~md5sum~~ (replaced by [hashsum](https://github.com/uutils/coreutils/blob/master/src/uu/hashsum/src/hashsum.rs)) | | |
| ~~sha1sum~~ (replaced by [hashsum](https://github.com/uutils/coreutils/blob/master/src/uu/hashsum/src/hashsum.rs)) | | |
| ~~sha224sum~~ (replaced by [hashsum](https://github.com/uutils/coreutils/blob/master/src/uu/hashsum/src/hashsum.rs)) | | |
| ~~sha256sum~~ (replaced by [hashsum](https://github.com/uutils/coreutils/blob/master/src/uu/hashsum/src/hashsum.rs)) | | |
| ~~sha384sum~~ (replaced by [hashsum](https://github.com/uutils/coreutils/blob/master/src/uu/hashsum/src/hashsum.rs)) | | |
| ~~sha512sum~~ (replaced by [hashsum](https://github.com/uutils/coreutils/blob/master/src/uu/hashsum/src/hashsum.rs)) | | |
| mkdir | | |
| mkfifo | | |
| mknod | | |

View file

@ -15,6 +15,7 @@ edition = "2018"
path = "src/dircolors.rs"
[dependencies]
clap = "2.33"
glob = "0.3.0"
uucore = { version=">=0.0.8", package="uucore", path="../../uucore" }
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }

View file

@ -1,6 +1,7 @@
// This file is part of the uutils coreutils package.
//
// (c) Jian Zeng <anonymousknight96@gmail.com>
// (c) Mitchell Mebane <mitchell.mebane@gmail.com>
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
@ -15,6 +16,15 @@ use std::env;
use std::fs::File;
use std::io::{BufRead, BufReader};
use clap::{crate_version, App, Arg};
/// Names under which the command-line arguments are registered with clap
/// and later looked up on the parsed matches.
mod options {
/// Flag selecting Bourne shell output.
pub const BOURNE_SHELL: &str = "bourne-shell";
/// Flag selecting C shell output.
pub const C_SHELL: &str = "c-shell";
/// Flag requesting a dump of the built-in database.
pub const PRINT_DATABASE: &str = "print-database";
/// Positional file operand(s).
pub const FILE: &str = "FILE";
}
static SYNTAX: &str = "[OPTION]... [FILE]";
static SUMMARY: &str = "Output commands to set the LS_COLORS environment variable.";
static LONG_HELP: &str = "
@ -52,28 +62,56 @@ pub fn guess_syntax() -> OutputFmt {
}
}
/// Builds the usage line shown in the help output: the executable name
/// followed by the static `SYNTAX` summary.
fn get_usage() -> String {
    let program = executable!();
    format!("{0} {1}", program, SYNTAX)
}
pub fn uumain(args: impl uucore::Args) -> i32 {
let args = args
.collect_str(InvalidEncodingHandling::Ignore)
.accept_any();
let matches = app!(SYNTAX, SUMMARY, LONG_HELP)
.optflag("b", "sh", "output Bourne shell code to set LS_COLORS")
.optflag(
"",
"bourne-shell",
"output Bourne shell code to set LS_COLORS",
)
.optflag("c", "csh", "output C shell code to set LS_COLORS")
.optflag("", "c-shell", "output C shell code to set LS_COLORS")
.optflag("p", "print-database", "print the byte counts")
.parse(args);
let usage = get_usage();
if (matches.opt_present("csh")
|| matches.opt_present("c-shell")
|| matches.opt_present("sh")
|| matches.opt_present("bourne-shell"))
&& matches.opt_present("print-database")
let matches = App::new(executable!())
.version(crate_version!())
.about(SUMMARY)
.usage(&usage[..])
.after_help(LONG_HELP)
.arg(
Arg::with_name(options::BOURNE_SHELL)
.long("sh")
.short("b")
.visible_alias("bourne-shell")
.help("output Bourne shell code to set LS_COLORS")
.display_order(1),
)
.arg(
Arg::with_name(options::C_SHELL)
.long("csh")
.short("c")
.visible_alias("c-shell")
.help("output C shell code to set LS_COLORS")
.display_order(2),
)
.arg(
Arg::with_name(options::PRINT_DATABASE)
.long("print-database")
.short("p")
.help("print the byte counts")
.display_order(3),
)
.arg(Arg::with_name(options::FILE).hidden(true).multiple(true))
.get_matches_from(&args);
let files = matches
.values_of(options::FILE)
.map_or(vec![], |file_values| file_values.collect());
// clap provides .conflicts_with / .conflicts_with_all, but we want to
// manually handle conflicts so we can match the output of GNU coreutils
if (matches.is_present(options::C_SHELL) || matches.is_present(options::BOURNE_SHELL))
&& matches.is_present(options::PRINT_DATABASE)
{
show_usage_error!(
"the options to output dircolors' internal database and\nto select a shell \
@ -82,12 +120,12 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
return 1;
}
if matches.opt_present("print-database") {
if !matches.free.is_empty() {
if matches.is_present(options::PRINT_DATABASE) {
if !files.is_empty() {
show_usage_error!(
"extra operand {}\nfile operands cannot be combined with \
--print-database (-p)",
matches.free[0]
files[0]
);
return 1;
}
@ -96,9 +134,9 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
}
let mut out_format = OutputFmt::Unknown;
if matches.opt_present("csh") || matches.opt_present("c-shell") {
if matches.is_present(options::C_SHELL) {
out_format = OutputFmt::CShell;
} else if matches.opt_present("sh") || matches.opt_present("bourne-shell") {
} else if matches.is_present(options::BOURNE_SHELL) {
out_format = OutputFmt::Shell;
}
@ -113,24 +151,20 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
}
let result;
if matches.free.is_empty() {
if files.is_empty() {
result = parse(INTERNAL_DB.lines(), out_format, "")
} else {
if matches.free.len() > 1 {
show_usage_error!("extra operand {}", matches.free[1]);
if files.len() > 1 {
show_usage_error!("extra operand {}", files[1]);
return 1;
}
match File::open(matches.free[0].as_str()) {
match File::open(files[0]) {
Ok(f) => {
let fin = BufReader::new(f);
result = parse(
fin.lines().filter_map(Result::ok),
out_format,
matches.free[0].as_str(),
)
result = parse(fin.lines().filter_map(Result::ok), out_format, files[0])
}
Err(e) => {
show_error!("{}: {}", matches.free[0], e);
show_error!("{}: {}", files[0], e);
return 1;
}
}

View file

@ -393,6 +393,7 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
although the apparent size is usually smaller, it may be larger due to holes \
in ('sparse') files, internal fragmentation, indirect blocks, and the like"
)
.alias("app") // The GNU testsuite uses this alias
)
.arg(
Arg::with_name(options::BLOCK_SIZE)

View file

@ -0,0 +1,9 @@
## Benchmarking hashsum
### To bench blake2
Taken from: https://github.com/uutils/coreutils/pull/2296
With a large file:
$ hyperfine "./target/release/coreutils hashsum --b2sum large-file" "b2sum large-file"

View file

@ -25,7 +25,7 @@ regex-syntax = "0.6.7"
sha1 = "0.6.0"
sha2 = "0.6.0"
sha3 = "0.6.0"
blake2-rfc = "0.2.18"
blake2b_simd = "0.5.11"
uucore = { version=">=0.0.8", package="uucore", path="../../uucore" }
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }

View file

@ -1,4 +1,3 @@
extern crate blake2_rfc;
extern crate digest;
extern crate md5;
extern crate sha1;
@ -49,9 +48,9 @@ impl Digest for md5::Context {
}
}
impl Digest for blake2_rfc::blake2b::Blake2b {
impl Digest for blake2b_simd::State {
fn new() -> Self {
blake2_rfc::blake2b::Blake2b::new(64)
Self::new()
}
fn input(&mut self, input: &[u8]) {
@ -59,12 +58,12 @@ impl Digest for blake2_rfc::blake2b::Blake2b {
}
fn result(&mut self, out: &mut [u8]) {
let hash_result = &self.clone().finalize();
let hash_result = &self.finalize();
out.copy_from_slice(&hash_result.as_bytes());
}
fn reset(&mut self) {
*self = blake2_rfc::blake2b::Blake2b::new(64);
*self = Self::new();
}
fn output_bits(&self) -> usize {

View file

@ -19,7 +19,6 @@ mod digest;
use self::digest::Digest;
use blake2_rfc::blake2b::Blake2b;
use clap::{App, Arg, ArgMatches};
use hex::ToHex;
use md5::Context as Md5;
@ -85,7 +84,11 @@ fn detect_algo<'a>(
"sha256sum" => ("SHA256", Box::new(Sha256::new()) as Box<dyn Digest>, 256),
"sha384sum" => ("SHA384", Box::new(Sha384::new()) as Box<dyn Digest>, 384),
"sha512sum" => ("SHA512", Box::new(Sha512::new()) as Box<dyn Digest>, 512),
"b2sum" => ("BLAKE2", Box::new(Blake2b::new(64)) as Box<dyn Digest>, 512),
"b2sum" => (
"BLAKE2",
Box::new(blake2b_simd::State::new()) as Box<dyn Digest>,
512,
),
"sha3sum" => match matches.value_of("bits") {
Some(bits_str) => match (&bits_str).parse::<usize>() {
Ok(224) => (
@ -187,7 +190,7 @@ fn detect_algo<'a>(
set_or_crash("SHA512", Box::new(Sha512::new()), 512)
}
if matches.is_present("b2sum") {
set_or_crash("BLAKE2", Box::new(Blake2b::new(64)), 512)
set_or_crash("BLAKE2", Box::new(blake2b_simd::State::new()), 512)
}
if matches.is_present("sha3") {
match matches.value_of("bits") {

View file

@ -20,6 +20,8 @@ uucore = { version = ">=0.0.7", package = "uucore", path = "../../uucore" }
uucore_procs = { version = ">=0.0.5", package = "uucore_procs", path = "../../uucore_procs" }
crossterm = ">=0.19"
atty = "0.2.14"
unicode-width = "0.1.7"
unicode-segmentation = "1.7.1"
[target.'cfg(target_os = "redox")'.dependencies]
redox_termios = "0.1"

View file

@ -29,6 +29,9 @@ use crossterm::{
terminal,
};
use unicode_segmentation::UnicodeSegmentation;
use unicode_width::UnicodeWidthStr;
pub mod options {
pub const SILENT: &str = "silent";
pub const LOGICAL: &str = "logical";
@ -140,7 +143,9 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
if let Some(files) = matches.values_of(options::FILES) {
let mut stdout = setup_term();
let length = files.len();
for (idx, file) in files.enumerate() {
let mut files_iter = files.peekable();
while let (Some(file), next_file) = (files_iter.next(), files_iter.peek()) {
let file = Path::new(file);
if file.is_dir() {
terminal::disable_raw_mode().unwrap();
@ -157,15 +162,14 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
}
let mut reader = BufReader::new(File::open(file).unwrap());
reader.read_to_string(&mut buff).unwrap();
let is_last = idx + 1 == length;
more(&buff, &mut stdout, is_last);
more(&buff, &mut stdout, next_file.copied());
buff.clear();
}
reset_term(&mut stdout);
} else if atty::isnt(atty::Stream::Stdin) {
stdin().read_to_string(&mut buff).unwrap();
let mut stdout = setup_term();
more(&buff, &mut stdout, true);
more(&buff, &mut stdout, None);
reset_term(&mut stdout);
} else {
show_usage_error!("bad usage");
@ -200,7 +204,7 @@ fn reset_term(stdout: &mut std::io::Stdout) {
#[inline(always)]
fn reset_term(_: &mut usize) {}
fn more(buff: &str, mut stdout: &mut Stdout, is_last: bool) {
fn more(buff: &str, mut stdout: &mut Stdout, next_file: Option<&str>) {
let (cols, rows) = terminal::size().unwrap();
let lines = break_buff(buff, usize::from(cols));
let line_count: u16 = lines.len().try_into().unwrap();
@ -214,8 +218,11 @@ fn more(buff: &str, mut stdout: &mut Stdout, is_last: bool) {
&mut stdout,
lines.clone(),
line_count,
next_file,
);
let is_last = next_file.is_none();
// Specifies whether we have reached the end of the file and should
// return on the next key press. However, we immediately return when
// this is the last file.
@ -267,6 +274,7 @@ fn more(buff: &str, mut stdout: &mut Stdout, is_last: bool) {
&mut stdout,
lines.clone(),
line_count,
next_file,
);
if lines_left == 0 {
@ -285,6 +293,7 @@ fn draw(
mut stdout: &mut std::io::Stdout,
lines: Vec<String>,
lc: u16,
next_file: Option<&str>,
) {
execute!(stdout, terminal::Clear(terminal::ClearType::CurrentLine)).unwrap();
let (up_mark, lower_mark) = calc_range(*upper_mark, rows, lc);
@ -299,7 +308,7 @@ fn draw(
.write_all(format!("\r{}\n", line).as_bytes())
.unwrap();
}
make_prompt_and_flush(&mut stdout, lower_mark, lc);
make_prompt_and_flush(&mut stdout, lower_mark, lc, next_file);
*upper_mark = up_mark;
}
@ -313,23 +322,30 @@ fn break_buff(buff: &str, cols: usize) -> Vec<String> {
lines
}
fn break_line(mut line: &str, cols: usize) -> Vec<String> {
let breaks = (line.len() / cols).saturating_add(1);
let mut lines = Vec::with_capacity(breaks);
// TODO: Use unicode width instead of the length in bytes.
if line.len() < cols {
fn break_line(line: &str, cols: usize) -> Vec<String> {
let width = UnicodeWidthStr::width(line);
let mut lines = Vec::new();
if width < cols {
lines.push(line.to_string());
return lines;
}
for _ in 1..=breaks {
let (line1, line2) = line.split_at(cols);
lines.push(line1.to_string());
if line2.len() < cols {
lines.push(line2.to_string());
break;
let gr_idx = UnicodeSegmentation::grapheme_indices(line, true);
let mut last_index = 0;
let mut total_width = 0;
for (index, grapheme) in gr_idx {
let width = UnicodeWidthStr::width(grapheme);
total_width += width;
if total_width > cols {
lines.push(line[last_index..index].to_string());
last_index = index;
total_width = width;
}
line = line2;
}
if last_index != line.len() {
lines.push(line[last_index..].to_string());
}
lines
}
@ -339,7 +355,7 @@ fn calc_range(mut upper_mark: u16, rows: u16, line_count: u16) -> (u16, u16) {
let mut lower_mark = upper_mark.saturating_add(rows);
if lower_mark >= line_count {
upper_mark = line_count.saturating_sub(rows);
upper_mark = line_count.saturating_sub(rows).saturating_add(1);
lower_mark = line_count;
} else {
lower_mark = lower_mark.saturating_sub(1)
@ -348,12 +364,20 @@ fn calc_range(mut upper_mark: u16, rows: u16, line_count: u16) -> (u16, u16) {
}
// Make a prompt similar to original more
fn make_prompt_and_flush(stdout: &mut Stdout, lower_mark: u16, lc: u16) {
fn make_prompt_and_flush(stdout: &mut Stdout, lower_mark: u16, lc: u16, next_file: Option<&str>) {
let status = if lower_mark == lc {
format!("Next file: {}", next_file.unwrap_or_default())
} else {
format!(
"{}%",
(lower_mark as f64 / lc as f64 * 100.0).round() as u16
)
};
write!(
stdout,
"\r{}--More--({}%){}",
"\r{}--More--({}){}",
Attribute::Reverse,
((lower_mark as f64 / lc as f64) * 100.0).round() as u16,
status,
Attribute::Reset
)
.unwrap();
@ -363,13 +387,14 @@ fn make_prompt_and_flush(stdout: &mut Stdout, lower_mark: u16, lc: u16) {
#[cfg(test)]
mod tests {
use super::{break_line, calc_range};
use unicode_width::UnicodeWidthStr;
// It is good to test the above functions
#[test]
fn test_calc_range() {
assert_eq!((0, 24), calc_range(0, 25, 100));
assert_eq!((50, 74), calc_range(50, 25, 100));
assert_eq!((75, 100), calc_range(85, 25, 100));
assert_eq!((76, 100), calc_range(85, 25, 100));
}
#[test]
fn test_break_lines_long() {
@ -379,11 +404,12 @@ mod tests {
}
let lines = break_line(&test_string, 80);
let widths: Vec<usize> = lines
.iter()
.map(|s| UnicodeWidthStr::width(&s[..]))
.collect();
assert_eq!(
(80, 80, 40),
(lines[0].len(), lines[1].len(), lines[2].len())
);
assert_eq!((80, 80, 40), (widths[0], widths[1], widths[2]));
}
#[test]
@ -397,4 +423,22 @@ mod tests {
assert_eq!(20, lines[0].len());
}
// Checks that `break_line` never splits a multi-code-point emoji (joined
// with zero-width joiners) and that wrapping is driven by display width.
#[test]
fn test_break_line_zwj() {
let mut test_string = String::with_capacity(1100);
for _ in 0..20 {
test_string.push_str("👩🏻‍🔬");
}
let lines = break_line(&test_string, 80);
let widths: Vec<usize> = lines
.iter()
.map(|s| UnicodeWidthStr::width(&s[..]))
.collect();
// Each emoji sequence has a display width of 6, so a line is broken at the
// largest multiple of 6 that fits in 80 columns: 6 * 13 = 78. The
// remaining 7 sequences form the second line: 6 * 7 = 42.
assert_eq!((78, 42), (widths[0], widths[1]));
}
}

View file

@ -255,7 +255,18 @@ fn handle_dir(path: &Path, options: &Options) -> bool {
// correctly on Windows
if let Err(e) = remove_dir_all(path) {
had_err = true;
show_error!("could not remove '{}': {}", path.display(), e);
if e.kind() == std::io::ErrorKind::PermissionDenied {
// GNU compatibility (rm/fail-eacces.sh)
// here, GNU doesn't use some kind of remove_dir_all
// It will show directory+file
show_error!(
"cannot remove '{}': {}",
path.display(),
"Permission denied"
);
} else {
show_error!("cannot remove '{}': {}", path.display(), e);
}
}
} else {
let mut dirs: VecDeque<DirEntry> = VecDeque::new();
@ -314,7 +325,16 @@ fn remove_dir(path: &Path, options: &Options) -> bool {
}
}
Err(e) => {
show_error!("cannot remove '{}': {}", path.display(), e);
if e.kind() == std::io::ErrorKind::PermissionDenied {
// GNU compatibility (rm/fail-eacces.sh)
show_error!(
"cannot remove '{}': {}",
path.display(),
"Permission denied"
);
} else {
show_error!("cannot remove '{}': {}", path.display(), e);
}
return true;
}
}
@ -352,7 +372,16 @@ fn remove_file(path: &Path, options: &Options) -> bool {
}
}
Err(e) => {
show_error!("removing '{}': {}", path.display(), e);
if e.kind() == std::io::ErrorKind::PermissionDenied {
// GNU compatibility (rm/fail-eacces.sh)
show_error!(
"cannot remove '{}': {}",
path.display(),
"Permission denied"
);
} else {
show_error!("cannot remove '{}': {}", path.display(), e);
}
return true;
}
}

View file

@ -102,17 +102,17 @@ pub fn read(
carry_over.clear();
carry_over.extend_from_slice(&buffer[read..]);
let payload = Chunk::new(buffer, |buf| {
let mut lines = unsafe {
// SAFETY: It is safe to transmute to a vector of lines with shorter lifetime,
// because it was only temporarily transmuted to a Vec<Line<'static>> to make recycling possible.
std::mem::transmute::<Vec<Line<'static>>, Vec<Line<'_>>>(lines)
};
let read = crash_if_err!(1, std::str::from_utf8(&buf[..read]));
parse_lines(read, &mut lines, separator, &settings);
lines
});
if !payload.borrow_lines().is_empty() {
if read != 0 {
let payload = Chunk::new(buffer, |buf| {
let mut lines = unsafe {
// SAFETY: It is safe to transmute to a vector of lines with shorter lifetime,
// because it was only temporarily transmuted to a Vec<Line<'static>> to make recycling possible.
std::mem::transmute::<Vec<Line<'static>>, Vec<Line<'_>>>(lines)
};
let read = crash_if_err!(1, std::str::from_utf8(&buf[..read]));
parse_lines(read, &mut lines, separator, &settings);
lines
});
sender.send(payload).unwrap();
}
if !should_continue {
@ -175,6 +175,7 @@ fn read_to_buffer(
separator: u8,
) -> (usize, bool) {
let mut read_target = &mut buffer[start_offset..];
let mut last_file_target_size = read_target.len();
loop {
match file.read(read_target) {
Ok(0) => {
@ -208,14 +209,27 @@ fn read_to_buffer(
read_target = &mut buffer[len..];
}
} else {
// This file is empty.
// This file has been fully read.
let mut leftover_len = read_target.len();
if last_file_target_size != leftover_len {
// The file was not empty.
let read_len = buffer.len() - leftover_len;
if buffer[read_len - 1] != separator {
// The file did not end with a separator. We have to insert one.
buffer[read_len] = separator;
leftover_len -= 1;
}
let read_len = buffer.len() - leftover_len;
read_target = &mut buffer[read_len..];
}
if let Some(next_file) = next_files.next() {
// There is another file.
last_file_target_size = leftover_len;
*file = next_file;
} else {
// This was the last file.
let leftover_len = read_target.len();
return (buffer.len() - leftover_len, false);
let read_len = buffer.len() - leftover_len;
return (read_len, false);
}
}
}

View file

@ -12,8 +12,12 @@
//! The buffers for the individual chunks are recycled. There are two buffers.
use std::cmp::Ordering;
use std::fs::File;
use std::io::BufReader;
use std::io::{BufWriter, Write};
use std::path::Path;
use std::process::Child;
use std::process::{Command, Stdio};
use std::{
fs::OpenOptions,
io::Read,
@ -25,12 +29,13 @@ use itertools::Itertools;
use tempfile::TempDir;
use crate::Line;
use crate::{
chunks::{self, Chunk},
compare_by, merge, output_sorted_lines, sort_by, GlobalSettings,
};
const MIN_BUFFER_SIZE: usize = 8_000;
const START_BUFFER_SIZE: usize = 8_000;
/// Sort files by using auxiliary files for storing intermediate chunks (if needed), and output the result.
pub fn ext_sort(files: &mut impl Iterator<Item = Box<dyn Read + Send>>, settings: &GlobalSettings) {
@ -63,10 +68,31 @@ pub fn ext_sort(files: &mut impl Iterator<Item = Box<dyn Read + Send>>, settings
);
match read_result {
ReadResult::WroteChunksToFile { chunks_written } => {
let files = (0..chunks_written)
.map(|chunk_num| tmp_dir.path().join(chunk_num.to_string()))
.collect::<Vec<_>>();
let mut merger = merge::merge(&files, settings);
let mut children = Vec::new();
let files = (0..chunks_written).map(|chunk_num| {
let file_path = tmp_dir.path().join(chunk_num.to_string());
let file = File::open(file_path).unwrap();
if let Some(compress_prog) = &settings.compress_prog {
let mut command = Command::new(compress_prog);
command.stdin(file).stdout(Stdio::piped()).arg("-d");
let mut child = crash_if_err!(
2,
command.spawn().map_err(|err| format!(
"couldn't execute compress program: errno {}",
err.raw_os_error().unwrap()
))
);
let child_stdout = child.stdout.take().unwrap();
children.push(child);
Box::new(BufReader::new(child_stdout)) as Box<dyn Read + Send>
} else {
Box::new(BufReader::new(file)) as Box<dyn Read + Send>
}
});
let mut merger = merge::merge_with_file_limit(files, settings);
for child in children {
assert_child_success(child, settings.compress_prog.as_ref().unwrap());
}
merger.write_all(settings);
}
ReadResult::SortedSingleChunk(chunk) => {
@ -132,7 +158,14 @@ fn reader_writer(
for _ in 0..2 {
chunks::read(
&mut sender_option,
vec![0; MIN_BUFFER_SIZE],
vec![
0;
if START_BUFFER_SIZE < buffer_size {
START_BUFFER_SIZE
} else {
buffer_size
}
],
Some(buffer_size),
&mut carry_over,
&mut file,
@ -171,6 +204,7 @@ fn reader_writer(
write(
&mut chunk,
&tmp_dir.path().join(file_number.to_string()),
settings.compress_prog.as_deref(),
separator,
);
@ -193,14 +227,45 @@ fn reader_writer(
}
/// Write the lines in `chunk` to `file`, separated by `separator`.
fn write(chunk: &mut Chunk, file: &Path, separator: u8) {
/// `compress_prog` is used to optionally compress file contents.
fn write(chunk: &mut Chunk, file: &Path, compress_prog: Option<&str>, separator: u8) {
chunk.with_lines_mut(|lines| {
// Write the lines to the file
let file = crash_if_err!(1, OpenOptions::new().create(true).write(true).open(file));
let mut writer = BufWriter::new(file);
for s in lines.iter() {
crash_if_err!(1, writer.write_all(s.line.as_bytes()));
crash_if_err!(1, writer.write_all(&[separator]));
}
if let Some(compress_prog) = compress_prog {
let mut command = Command::new(compress_prog);
command.stdin(Stdio::piped()).stdout(file);
let mut child = crash_if_err!(
2,
command.spawn().map_err(|err| format!(
"couldn't execute compress program: errno {}",
err.raw_os_error().unwrap()
))
);
let mut writer = BufWriter::new(child.stdin.take().unwrap());
write_lines(lines, &mut writer, separator);
writer.flush().unwrap();
drop(writer);
assert_child_success(child, compress_prog);
} else {
let mut writer = BufWriter::new(file);
write_lines(lines, &mut writer, separator);
};
});
}
/// Writes every line in `lines` to `writer`, each one followed by the
/// single `separator` byte. Any write error aborts the program with exit
/// code 1.
fn write_lines<'a, T: Write>(lines: &[Line<'a>], writer: &mut T, separator: u8) {
    let terminator = [separator];
    for entry in lines.iter() {
        crash_if_err!(1, writer.write_all(entry.line.as_bytes()));
        crash_if_err!(1, writer.write_all(&terminator));
    }
}
/// Waits for `child` and aborts with exit code 2 if it reported a non-zero
/// exit code.
///
/// Only a successful `wait` that yields a non-zero exit code counts as a
/// failure; a failed `wait` or a status without an exit code is let through.
fn assert_child_success(mut child: Child, program: &str) {
    let exit_code = child.wait().map(|status| status.code());
    match exit_code {
        Ok(Some(0)) | Ok(None) | Err(_) => {}
        _ => crash!(2, "'{}' terminated abnormally", program),
    }
}

View file

@ -9,8 +9,8 @@
use std::{
cmp::Ordering,
ffi::OsStr,
io::{Read, Write},
fs::File,
io::{BufWriter, Read, Write},
iter,
rc::Rc,
sync::mpsc::{channel, sync_channel, Receiver, Sender, SyncSender},
@ -18,18 +18,69 @@ use std::{
};
use compare::Compare;
use itertools::Itertools;
use crate::{
chunks::{self, Chunk},
compare_by, open, GlobalSettings,
compare_by, GlobalSettings,
};
// Merge already sorted files.
pub fn merge<'a>(files: &[impl AsRef<OsStr>], settings: &'a GlobalSettings) -> FileMerger<'a> {
/// Merge already sorted inputs while merging at most
/// `settings.merge_batch_size` of them at once.
///
/// When `files` yields more inputs than the batch size, batches are merged
/// into intermediate files inside a temporary directory created under
/// `settings.tmp_dir`, and those intermediate files are merged in turn
/// (recursively, if there are still too many of them). Otherwise the inputs
/// are merged directly.
pub fn merge_with_file_limit<F: ExactSizeIterator<Item = Box<dyn Read + Send>>>(
files: F,
settings: &GlobalSettings,
) -> FileMerger {
if files.len() > settings.merge_batch_size {
let tmp_dir = tempfile::Builder::new()
.prefix("uutils_sort")
.tempdir_in(&settings.tmp_dir)
.unwrap();
// Number of intermediate files written so far; also used as their file name.
let mut batch_number = 0;
// Number of inputs that have not been assigned to a merged batch yet.
let mut remaining_files = files.len();
let batches = files.chunks(settings.merge_batch_size);
let mut batches = batches.into_iter();
// Keep merging batches into intermediate files for as long as the
// intermediate files plus the untouched inputs exceed the batch size.
while batch_number + remaining_files > settings.merge_batch_size && remaining_files != 0 {
remaining_files = remaining_files.saturating_sub(settings.merge_batch_size);
let mut merger = merge_without_limit(batches.next().unwrap(), settings);
let tmp_file = File::create(tmp_dir.path().join(batch_number.to_string())).unwrap();
merger.write_all_to(settings, &mut BufWriter::new(tmp_file));
batch_number += 1;
}
// Lazily re-opens the intermediate files, in the order they were written.
let batch_files = (0..batch_number).map(|n| {
Box::new(File::open(tmp_dir.path().join(n.to_string())).unwrap())
as Box<dyn Read + Send>
});
if batch_number > settings.merge_batch_size {
// Still too many intermediate files for one merge: every input must
// have been consumed by now, so apply the same procedure to them.
assert!(batches.next().is_none());
// NOTE(review): the iterator is boxed as a trait object, presumably so
// the recursive call does not need a new generic instantiation — confirm.
merge_with_file_limit(
Box::new(batch_files) as Box<dyn ExactSizeIterator<Item = Box<dyn Read + Send>>>,
settings,
)
} else {
// The intermediate files and the (optional) last, not yet merged
// batch are merged together in one final pass.
let final_batch = batches.next();
assert!(batches.next().is_none());
merge_without_limit(
batch_files.chain(final_batch.into_iter().flatten()),
settings,
)
}
} else {
// Few enough inputs: merge them directly.
merge_without_limit(files, settings)
}
}
/// Merge files without limiting how many files are concurrently open
///
/// It is the responsibility of the caller to ensure that `files` yields only
/// as many files as we are allowed to open concurrently.
fn merge_without_limit<F: Iterator<Item = Box<dyn Read + Send>>>(
files: F,
settings: &GlobalSettings,
) -> FileMerger {
let (request_sender, request_receiver) = channel();
let mut reader_files = Vec::with_capacity(files.len());
let mut loaded_receivers = Vec::with_capacity(files.len());
for (file_number, file) in files.iter().map(open).enumerate() {
let mut reader_files = Vec::with_capacity(files.size_hint().0);
let mut loaded_receivers = Vec::with_capacity(files.size_hint().0);
for (file_number, file) in files.enumerate() {
let (sender, receiver) = sync_channel(2);
loaded_receivers.push(receiver);
reader_files.push(ReaderFile {
@ -146,7 +197,11 @@ impl<'a> FileMerger<'a> {
/// Write the merged contents to the output file.
pub fn write_all(&mut self, settings: &GlobalSettings) {
let mut out = settings.out_writer();
while self.write_next(settings, &mut out) {}
self.write_all_to(settings, &mut out);
}
pub fn write_all_to(&mut self, settings: &GlobalSettings, out: &mut impl Write) {
while self.write_next(settings, out) {}
}
fn write_next(&mut self, settings: &GlobalSettings, out: &mut impl Write) -> bool {

View file

@ -96,6 +96,8 @@ static OPT_PARALLEL: &str = "parallel";
static OPT_FILES0_FROM: &str = "files0-from";
static OPT_BUF_SIZE: &str = "buffer-size";
static OPT_TMP_DIR: &str = "temporary-directory";
static OPT_COMPRESS_PROG: &str = "compress-program";
static OPT_BATCH_SIZE: &str = "batch-size";
static ARG_FILES: &str = "files";
@ -156,6 +158,8 @@ pub struct GlobalSettings {
zero_terminated: bool,
buffer_size: usize,
tmp_dir: PathBuf,
compress_prog: Option<String>,
merge_batch_size: usize,
}
impl GlobalSettings {
@ -223,6 +227,8 @@ impl Default for GlobalSettings {
zero_terminated: false,
buffer_size: DEFAULT_BUF_SIZE,
tmp_dir: PathBuf::new(),
compress_prog: None,
merge_batch_size: 16,
}
}
}
@ -1076,6 +1082,19 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
.takes_value(true)
.value_name("DIR"),
)
.arg(
Arg::with_name(OPT_COMPRESS_PROG)
.long(OPT_COMPRESS_PROG)
.help("compress temporary files with PROG, decompress with PROG -d")
.long_help("PROG has to take input from stdin and output to stdout")
.value_name("PROG")
)
.arg(
Arg::with_name(OPT_BATCH_SIZE)
.long(OPT_BATCH_SIZE)
.help("Merge at most N_MERGE inputs at once.")
.value_name("N_MERGE")
)
.arg(
Arg::with_name(OPT_FILES0_FROM)
.long(OPT_FILES0_FROM)
@ -1167,6 +1186,14 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
.map(PathBuf::from)
.unwrap_or_else(env::temp_dir);
settings.compress_prog = matches.value_of(OPT_COMPRESS_PROG).map(String::from);
if let Some(n_merge) = matches.value_of(OPT_BATCH_SIZE) {
settings.merge_batch_size = n_merge
.parse()
.unwrap_or_else(|_| crash!(2, "invalid --batch-size argument '{}'", n_merge));
}
settings.zero_terminated = matches.is_present(OPT_ZERO_TERMINATED);
settings.merge = matches.is_present(OPT_MERGE);
@ -1242,7 +1269,7 @@ fn output_sorted_lines<'a>(iter: impl Iterator<Item = &'a Line<'a>>, settings: &
fn exec(files: &[String], settings: &GlobalSettings) -> i32 {
if settings.merge {
let mut file_merger = merge::merge(files, settings);
let mut file_merger = merge::merge_with_file_limit(files.iter().map(open), settings);
file_merger.write_all(settings);
} else if settings.check {
if files.len() > 1 {

View file

@ -837,3 +837,64 @@ fn test_nonexistent_file() {
fn test_blanks() {
test_helper("blanks", &["-b", "--ignore-blanks"]);
}
// Sorts two fixture files in one invocation and expects each line on its
// own output line.
// NOTE(review): the fixture names suggest neither file ends with a newline,
// so this presumably guards the separator handling between inputs — confirm
// against the fixtures.
#[test]
fn sort_multiple() {
new_ucmd!()
.args(&["no_trailing_newline1.txt", "no_trailing_newline2.txt"])
.succeeds()
.stdout_is("a\nb\nb\n");
}
// Runs sort with a 40-byte buffer (`-S 40B`) on piped input and expects the
// two identical lines back unchanged.
// NOTE(review): judging by the name, this covers a read that yields an empty
// chunk — confirm.
#[test]
fn sort_empty_chunk() {
new_ucmd!()
.args(&["-S", "40B"])
.pipe_in("a\na\n")
.succeeds()
.stdout_is("a\na\n");
}
// Linux only: numeric sort of `ext_sort.txt` with a buffer size of 10 and
// `gzip` as the `--compress-program`; the output must match the
// `ext_sort.expected` fixture and nothing may be written to stderr.
#[test]
#[cfg(target_os = "linux")]
fn test_compress() {
new_ucmd!()
.args(&[
"ext_sort.txt",
"-n",
"--compress-program",
"gzip",
"-S",
"10",
])
.succeeds()
.stdout_only_fixture("ext_sort.expected");
}
// Same invocation as `test_compress`, but with a compress program that is
// expected not to exist: sort must fail and print only the
// "couldn't execute compress program" message carrying errno 2.
#[test]
fn test_compress_fail() {
new_ucmd!()
.args(&[
"ext_sort.txt",
"-n",
"--compress-program",
"nonexistent-program",
"-S",
"10",
])
.fails()
.stderr_only("sort: couldn't execute compress program: errno 2");
}
// Numeric sort of `ext_sort.txt` with a 150-byte buffer; the output must
// match the `ext_sort.expected` fixture.
// NOTE(review): no `--batch-size` is passed, so this presumably relies on the
// small buffer producing more temporary chunks than the default merge batch
// size — confirm.
#[test]
fn test_merge_batches() {
new_ucmd!()
.args(&[
"ext_sort.txt",
"-n",
"-S",
"150B",
])
.succeeds()
.stdout_only_fixture("ext_sort.expected");
}

View file

@ -0,0 +1,2 @@
a
b

View file

@ -0,0 +1 @@
b

View file

@ -44,7 +44,7 @@ sed -i 's|"\$@|/usr/bin/timeout 600 "\$@|' build-aux/test-driver
# Change the PATH in the Makefile to test the uutils coreutils instead of the GNU coreutils
sed -i "s/^[[:blank:]]*PATH=.*/ PATH='${BUILDDIR//\//\\/}\$(PATH_SEPARATOR)'\"\$\$PATH\" \\\/" Makefile
sed -i 's| tr | /usr/bin/tr |' tests/init.sh
make
make -j "$(nproc)"
# Generate the factor tests, so they can be fixed
# Used to be 36. Reduced to 20 to decrease the log size
for i in {00..20}
@ -59,7 +59,7 @@ do
done
grep -rl 'path_prepend_' tests/* | xargs sed -i 's|path_prepend_ ./src||'
grep -rl 'path_prepend_' tests/* | xargs sed -i 's| path_prepend_ ./src||'
sed -i -e 's|^seq |/usr/bin/seq |' -e 's|sha1sum |/usr/bin/sha1sum |' tests/factor/t*sh
# Remove tests checking for --version & --help
@ -94,8 +94,28 @@ sed -i 's|cp |/usr/bin/cp |' tests/mv/hard-2.sh
sed -i 's|paste |/usr/bin/paste |' tests/misc/od-endian.sh
sed -i 's|seq |/usr/bin/seq |' tests/misc/sort-discrim.sh
#Add specific timeout to tests that currently hang to limit time spent waiting
# Add specific timeout to tests that currently hang to limit time spent waiting
sed -i 's|seq \$|/usr/bin/timeout 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh
sed -i 's|cat |/usr/bin/timeout 0.1 cat |' tests/misc/cat-self.sh
# Remove dup of /usr/bin/ when executed several times
grep -rl '/usr/bin//usr/bin/' tests/* | xargs --no-run-if-empty sed -i 's|/usr/bin//usr/bin/|/usr/bin/|g'
#### Adjust tests to make them work with Rust/coreutils
# In some cases, what we are doing in rust/coreutils is good (or better);
# we should not regress our project just to match what GNU is doing.
# So, do some changes on the fly
sed -i -e "s|rm: cannot remove 'e/slink'|rm: cannot remove 'e'|g" tests/rm/fail-eacces.sh
sed -i -e "s|rm: cannot remove 'a/b/file'|rm: cannot remove 'a'|g" tests/rm/cycle.sh
sed -i -e "s|rm: cannot remove directory 'b/a/p'|rm: cannot remove 'b'|g" tests/rm/rm1.sh
sed -i -e "s|rm: cannot remove 'a/1'|rm: cannot remove 'a'|g" tests/rm/rm2.sh
sed -i -e "s|removed directory 'a/'|removed directory 'a'|g" tests/rm/v-slash.sh
test -f "${BUILDDIR}/getlimits" || cp src/getlimits "${BUILDDIR}"