diff --git a/Cargo.lock b/Cargo.lock index 6e1b773b3..43d491cef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -44,13 +44,16 @@ dependencies = [ ] [[package]] -name = "arrayvec" -version = "0.4.12" +name = "arrayref" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd9fd44efafa8690358b7408d253adf110036b88f55672a933f01d616ad9b1b9" -dependencies = [ - "nodrop", -] +checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" + +[[package]] +name = "arrayvec" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" [[package]] name = "atty" @@ -100,11 +103,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" [[package]] -name = "blake2-rfc" -version = "0.2.18" +name = "blake2b_simd" +version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d6d530bdd2d52966a6d03b7a964add7ae1a288d25214066fd4b600f0f796400" +checksum = "afa748e348ad3be8263be728124b24a24f268266f6f5d58af9d75f6a40b5c587" dependencies = [ + "arrayref", "arrayvec", "constant_time_eq", ] @@ -700,9 +704,9 @@ checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3" [[package]] name = "heck" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" dependencies = [ "unicode-segmentation", ] @@ -1383,12 +1387,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" -dependencies = [ - "byteorder", -] +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" [[package]] name = "regex-syntax" @@ -1501,9 +1502,9 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.3.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f1d0fef1604ba8f7a073c7e701f213e056707210e9020af4528e0101ce11a6" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" dependencies = [ "libc", ] @@ -2029,7 +2030,7 @@ dependencies = [ name = "uu_hashsum" version = "0.0.6" dependencies = [ - "blake2-rfc", + "blake2b_simd", "clap", "digest", "hex", diff --git a/Cargo.toml b/Cargo.toml index 5f89a4077..19ebca511 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -349,7 +349,7 @@ sha1 = { version="0.6", features=["std"] } tempfile = "3.2.0" time = "0.1" unindent = "0.1" -uucore = { version=">=0.0.8", package="uucore", path="src/uucore", features=["entries"] } +uucore = { version=">=0.0.8", package="uucore", path="src/uucore", features=["entries", "process"] } walkdir = "2.2" atty = "0.2.14" diff --git a/src/uu/groups/src/groups.rs b/src/uu/groups/src/groups.rs index 5b9cd948a..07c25cebb 100644 --- a/src/uu/groups/src/groups.rs +++ b/src/uu/groups/src/groups.rs @@ -10,7 +10,7 @@ #[macro_use] extern crate uucore; -use uucore::entries::{get_groups, gid2grp, Locate, Passwd}; +use uucore::entries::{get_groups_gnu, gid2grp, Locate, Passwd}; use clap::{crate_version, App, Arg}; @@ -35,7 +35,7 @@ pub fn uumain(args: impl uucore::Args) -> i32 { None => { println!( "{}", - get_groups() + get_groups_gnu(None) .unwrap() .iter() .map(|&g| gid2grp(g).unwrap()) diff --git a/src/uu/hashsum/BENCHMARKING.md b/src/uu/hashsum/BENCHMARKING.md new file mode 100644 index 000000000..cef710a19 --- /dev/null +++ b/src/uu/hashsum/BENCHMARKING.md @@ -0,0 +1,9 @@ +## Benchmarking hashsum + +### To bench blake2 + +Taken from: https://github.com/uutils/coreutils/pull/2296 + +With a large file: +$ hyperfine "./target/release/coreutils hashsum --b2sum large-file" "b2sum large-file" + diff --git a/src/uu/hashsum/Cargo.toml b/src/uu/hashsum/Cargo.toml index 04a22cac7..11388ebf8 100644 --- a/src/uu/hashsum/Cargo.toml +++ b/src/uu/hashsum/Cargo.toml @@ -25,7 +25,7 @@ regex-syntax = "0.6.7" sha1 = "0.6.0" sha2 = "0.6.0" sha3 = "0.6.0" -blake2-rfc = "0.2.18" +blake2b_simd = "0.5.11" uucore = { version=">=0.0.8", package="uucore", path="../../uucore" } uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" } diff --git a/src/uu/hashsum/src/digest.rs b/src/uu/hashsum/src/digest.rs index 218de0a36..25bc7f4c3 100644 --- a/src/uu/hashsum/src/digest.rs +++ b/src/uu/hashsum/src/digest.rs @@ -1,4 +1,3 @@ -extern crate blake2_rfc; extern crate digest; extern crate md5; extern crate sha1; @@ -49,9 +48,9 @@ impl Digest for md5::Context { } } -impl Digest for blake2_rfc::blake2b::Blake2b { +impl Digest for blake2b_simd::State { fn new() -> Self { - blake2_rfc::blake2b::Blake2b::new(64) + Self::new() } fn input(&mut self, input: &[u8]) { @@ -59,12 +58,12 @@ impl Digest for blake2_rfc::blake2b::Blake2b { } fn result(&mut self, out: &mut [u8]) { - let hash_result = &self.clone().finalize(); + let hash_result = &self.finalize(); out.copy_from_slice(&hash_result.as_bytes()); } fn reset(&mut self) { - *self = blake2_rfc::blake2b::Blake2b::new(64); + *self = Self::new(); } fn output_bits(&self) -> usize { diff --git a/src/uu/hashsum/src/hashsum.rs b/src/uu/hashsum/src/hashsum.rs index b39b5788c..9822ca3fa 100644 --- a/src/uu/hashsum/src/hashsum.rs +++ b/src/uu/hashsum/src/hashsum.rs @@ -19,7 +19,6 @@ mod digest; use self::digest::Digest; -use blake2_rfc::blake2b::Blake2b; use clap::{App, Arg, ArgMatches}; use hex::ToHex; use md5::Context as Md5; @@ -85,7 +84,11 @@ fn detect_algo<'a>( "sha256sum" => ("SHA256", Box::new(Sha256::new()) as Box, 256), "sha384sum" => ("SHA384", Box::new(Sha384::new()) as Box, 384), "sha512sum" => ("SHA512", Box::new(Sha512::new()) as Box, 512), - "b2sum" => ("BLAKE2", Box::new(Blake2b::new(64)) as Box, 512), + "b2sum" => ( + "BLAKE2", + Box::new(blake2b_simd::State::new()) as Box, + 512, + ), "sha3sum" => match matches.value_of("bits") { Some(bits_str) => match (&bits_str).parse::() { Ok(224) => ( @@ -187,7 +190,7 @@ fn detect_algo<'a>( set_or_crash("SHA512", Box::new(Sha512::new()), 512) } if matches.is_present("b2sum") { - set_or_crash("BLAKE2", Box::new(Blake2b::new(64)), 512) + set_or_crash("BLAKE2", Box::new(blake2b_simd::State::new()), 512) } if matches.is_present("sha3") { match matches.value_of("bits") { diff --git a/src/uu/id/src/id.rs b/src/uu/id/src/id.rs index f44d77c5f..360f6a09c 100644 --- a/src/uu/id/src/id.rs +++ b/src/uu/id/src/id.rs @@ -148,7 +148,7 @@ pub fn uumain(args: impl uucore::Args) -> i32 { Arg::with_name(options::OPT_REAL_ID) .short("r") .long(options::OPT_REAL_ID) - .help("Display the real ID for the -g and -u options instead of the effective ID."), + .help("Display the real ID for the -G, -g and -u options instead of the effective ID."), ) .arg( Arg::with_name(options::OPT_ZERO) @@ -234,26 +234,23 @@ pub fn uumain(args: impl uucore::Args) -> i32 { } if gsflag { - let delimiter = if zflag { "" } else { " " }; + let delimiter = if zflag { "\0" } else { " " }; + let id = possible_pw + .map(|p| p.gid()) + .unwrap_or(if rflag { getgid() } else { getegid() }); print!( "{}{}", - if nflag { - possible_pw - .map(|p| p.belongs_to()) - .unwrap_or_else(|| entries::get_groups().unwrap()) - .iter() - .map(|&id| entries::gid2grp(id).unwrap()) - .collect::>() - .join(delimiter) - } else { - possible_pw - .map(|p| p.belongs_to()) - .unwrap_or_else(|| entries::get_groups().unwrap()) - .iter() - .map(|&id| id.to_string()) - .collect::>() - .join(delimiter) - }, + possible_pw + .map(|p| p.belongs_to()) + .unwrap_or_else(|| entries::get_groups_gnu(Some(id)).unwrap()) + .iter() + .map(|&id| if nflag { + entries::gid2grp(id).unwrap_or_else(|_| id.to_string()) + } else { + id.to_string() + }) + .collect::>() + .join(delimiter), line_ending ); return 0; @@ -321,7 +318,7 @@ fn pretty(possible_pw: Option) { println!( "groups\t{}", - entries::get_groups() + entries::get_groups_gnu(None) .unwrap() .iter() .map(|&gr| entries::gid2grp(gr).unwrap()) @@ -420,5 +417,3 @@ fn id_print(possible_pw: Option, p_euid: bool, p_egid: bool) { .join(",") ); } - -fn get_groups() -> diff --git a/src/uu/more/src/more.rs b/src/uu/more/src/more.rs index c1df9afa0..4d345e96b 100644 --- a/src/uu/more/src/more.rs +++ b/src/uu/more/src/more.rs @@ -143,7 +143,9 @@ pub fn uumain(args: impl uucore::Args) -> i32 { if let Some(files) = matches.values_of(options::FILES) { let mut stdout = setup_term(); let length = files.len(); - for (idx, file) in files.enumerate() { + + let mut files_iter = files.peekable(); + while let (Some(file), next_file) = (files_iter.next(), files_iter.peek()) { let file = Path::new(file); if file.is_dir() { terminal::disable_raw_mode().unwrap(); @@ -160,15 +162,14 @@ pub fn uumain(args: impl uucore::Args) -> i32 { } let mut reader = BufReader::new(File::open(file).unwrap()); reader.read_to_string(&mut buff).unwrap(); - let is_last = idx + 1 == length; - more(&buff, &mut stdout, is_last); + more(&buff, &mut stdout, next_file.copied()); buff.clear(); } reset_term(&mut stdout); } else if atty::isnt(atty::Stream::Stdin) { stdin().read_to_string(&mut buff).unwrap(); let mut stdout = setup_term(); - more(&buff, &mut stdout, true); + more(&buff, &mut stdout, None); reset_term(&mut stdout); } else { show_usage_error!("bad usage"); @@ -203,7 +204,7 @@ fn reset_term(stdout: &mut std::io::Stdout) { #[inline(always)] fn reset_term(_: &mut usize) {} -fn more(buff: &str, mut stdout: &mut Stdout, is_last: bool) { +fn more(buff: &str, mut stdout: &mut Stdout, next_file: Option<&str>) { let (cols, rows) = terminal::size().unwrap(); let lines = break_buff(buff, usize::from(cols)); let line_count: u16 = lines.len().try_into().unwrap(); @@ -217,8 +218,11 @@ fn more(buff: &str, mut stdout: &mut Stdout, is_last: bool) { &mut stdout, lines.clone(), line_count, + next_file, ); + let is_last = next_file.is_none(); + // Specifies whether we have reached the end of the file and should // return on the next key press. However, we immediately return when // this is the last file. @@ -270,6 +274,7 @@ fn more(buff: &str, mut stdout: &mut Stdout, is_last: bool) { &mut stdout, lines.clone(), line_count, + next_file, ); if lines_left == 0 { @@ -288,6 +293,7 @@ fn draw( mut stdout: &mut std::io::Stdout, lines: Vec, lc: u16, + next_file: Option<&str>, ) { execute!(stdout, terminal::Clear(terminal::ClearType::CurrentLine)).unwrap(); let (up_mark, lower_mark) = calc_range(*upper_mark, rows, lc); @@ -302,7 +308,7 @@ fn draw( .write_all(format!("\r{}\n", line).as_bytes()) .unwrap(); } - make_prompt_and_flush(&mut stdout, lower_mark, lc); + make_prompt_and_flush(&mut stdout, lower_mark, lc, next_file); *upper_mark = up_mark; } @@ -358,12 +364,20 @@ fn calc_range(mut upper_mark: u16, rows: u16, line_count: u16) -> (u16, u16) { } // Make a prompt similar to original more -fn make_prompt_and_flush(stdout: &mut Stdout, lower_mark: u16, lc: u16) { +fn make_prompt_and_flush(stdout: &mut Stdout, lower_mark: u16, lc: u16, next_file: Option<&str>) { + let status = if lower_mark == lc { + format!("Next file: {}", next_file.unwrap_or_default()) + } else { + format!( + "{}%", + (lower_mark as f64 / lc as f64 * 100.0).round() as u16 + ) + }; write!( stdout, - "\r{}--More--({}%){}", + "\r{}--More--({}){}", Attribute::Reverse, - ((lower_mark as f64 / lc as f64) * 100.0).round() as u16, + status, Attribute::Reset ) .unwrap(); diff --git a/src/uu/sort/src/chunks.rs b/src/uu/sort/src/chunks.rs index 23567833b..dde6febd3 100644 --- a/src/uu/sort/src/chunks.rs +++ b/src/uu/sort/src/chunks.rs @@ -102,17 +102,17 @@ pub fn read( carry_over.clear(); carry_over.extend_from_slice(&buffer[read..]); - let payload = Chunk::new(buffer, |buf| { - let mut lines = unsafe { - // SAFETY: It is safe to transmute to a vector of lines with shorter lifetime, - // because it was only temporarily transmuted to a Vec> to make recycling possible. - std::mem::transmute::>, Vec>>(lines) - }; - let read = crash_if_err!(1, std::str::from_utf8(&buf[..read])); - parse_lines(read, &mut lines, separator, &settings); - lines - }); - if !payload.borrow_lines().is_empty() { + if read != 0 { + let payload = Chunk::new(buffer, |buf| { + let mut lines = unsafe { + // SAFETY: It is safe to transmute to a vector of lines with shorter lifetime, + // because it was only temporarily transmuted to a Vec> to make recycling possible. + std::mem::transmute::>, Vec>>(lines) + }; + let read = crash_if_err!(1, std::str::from_utf8(&buf[..read])); + parse_lines(read, &mut lines, separator, &settings); + lines + }); sender.send(payload).unwrap(); } if !should_continue { @@ -175,6 +175,7 @@ fn read_to_buffer( separator: u8, ) -> (usize, bool) { let mut read_target = &mut buffer[start_offset..]; + let mut last_file_target_size = read_target.len(); loop { match file.read(read_target) { Ok(0) => { @@ -208,14 +209,27 @@ fn read_to_buffer( read_target = &mut buffer[len..]; } } else { - // This file is empty. + // This file has been fully read. + let mut leftover_len = read_target.len(); + if last_file_target_size != leftover_len { + // The file was not empty. + let read_len = buffer.len() - leftover_len; + if buffer[read_len - 1] != separator { + // The file did not end with a separator. We have to insert one. + buffer[read_len] = separator; + leftover_len -= 1; + } + let read_len = buffer.len() - leftover_len; + read_target = &mut buffer[read_len..]; + } if let Some(next_file) = next_files.next() { // There is another file. + last_file_target_size = leftover_len; *file = next_file; } else { // This was the last file. - let leftover_len = read_target.len(); - return (buffer.len() - leftover_len, false); + let read_len = buffer.len() - leftover_len; + return (read_len, false); } } } diff --git a/src/uu/sort/src/ext_sort.rs b/src/uu/sort/src/ext_sort.rs index 9b1845efa..c439adcdc 100644 --- a/src/uu/sort/src/ext_sort.rs +++ b/src/uu/sort/src/ext_sort.rs @@ -12,8 +12,12 @@ //! The buffers for the individual chunks are recycled. There are two buffers. use std::cmp::Ordering; +use std::fs::File; +use std::io::BufReader; use std::io::{BufWriter, Write}; use std::path::Path; +use std::process::Child; +use std::process::{Command, Stdio}; use std::{ fs::OpenOptions, io::Read, @@ -25,12 +29,13 @@ use itertools::Itertools; use tempfile::TempDir; +use crate::Line; use crate::{ chunks::{self, Chunk}, compare_by, merge, output_sorted_lines, sort_by, GlobalSettings, }; -const MIN_BUFFER_SIZE: usize = 8_000; +const START_BUFFER_SIZE: usize = 8_000; /// Sort files by using auxiliary files for storing intermediate chunks (if needed), and output the result. pub fn ext_sort(files: &mut impl Iterator>, settings: &GlobalSettings) { @@ -63,10 +68,31 @@ pub fn ext_sort(files: &mut impl Iterator>, settings ); match read_result { ReadResult::WroteChunksToFile { chunks_written } => { - let files = (0..chunks_written) - .map(|chunk_num| tmp_dir.path().join(chunk_num.to_string())) - .collect::>(); - let mut merger = merge::merge(&files, settings); + let mut children = Vec::new(); + let files = (0..chunks_written).map(|chunk_num| { + let file_path = tmp_dir.path().join(chunk_num.to_string()); + let file = File::open(file_path).unwrap(); + if let Some(compress_prog) = &settings.compress_prog { + let mut command = Command::new(compress_prog); + command.stdin(file).stdout(Stdio::piped()).arg("-d"); + let mut child = crash_if_err!( + 2, + command.spawn().map_err(|err| format!( + "couldn't execute compress program: errno {}", + err.raw_os_error().unwrap() + )) + ); + let child_stdout = child.stdout.take().unwrap(); + children.push(child); + Box::new(BufReader::new(child_stdout)) as Box + } else { + Box::new(BufReader::new(file)) as Box + } + }); + let mut merger = merge::merge_with_file_limit(files, settings); + for child in children { + assert_child_success(child, settings.compress_prog.as_ref().unwrap()); + } merger.write_all(settings); } ReadResult::SortedSingleChunk(chunk) => { @@ -132,7 +158,14 @@ fn reader_writer( for _ in 0..2 { chunks::read( &mut sender_option, - vec![0; MIN_BUFFER_SIZE], + vec![ + 0; + if START_BUFFER_SIZE < buffer_size { + START_BUFFER_SIZE + } else { + buffer_size + } + ], Some(buffer_size), &mut carry_over, &mut file, @@ -171,6 +204,7 @@ fn reader_writer( write( &mut chunk, &tmp_dir.path().join(file_number.to_string()), + settings.compress_prog.as_deref(), separator, ); @@ -193,14 +227,45 @@ fn reader_writer( } /// Write the lines in `chunk` to `file`, separated by `separator`. -fn write(chunk: &mut Chunk, file: &Path, separator: u8) { +/// `compress_prog` is used to optionally compress file contents. +fn write(chunk: &mut Chunk, file: &Path, compress_prog: Option<&str>, separator: u8) { chunk.with_lines_mut(|lines| { // Write the lines to the file let file = crash_if_err!(1, OpenOptions::new().create(true).write(true).open(file)); - let mut writer = BufWriter::new(file); - for s in lines.iter() { - crash_if_err!(1, writer.write_all(s.line.as_bytes())); - crash_if_err!(1, writer.write_all(&[separator])); - } + if let Some(compress_prog) = compress_prog { + let mut command = Command::new(compress_prog); + command.stdin(Stdio::piped()).stdout(file); + let mut child = crash_if_err!( + 2, + command.spawn().map_err(|err| format!( + "couldn't execute compress program: errno {}", + err.raw_os_error().unwrap() + )) + ); + let mut writer = BufWriter::new(child.stdin.take().unwrap()); + write_lines(lines, &mut writer, separator); + writer.flush().unwrap(); + drop(writer); + assert_child_success(child, compress_prog); + } else { + let mut writer = BufWriter::new(file); + write_lines(lines, &mut writer, separator); + }; }); } + +fn write_lines<'a, T: Write>(lines: &[Line<'a>], writer: &mut T, separator: u8) { + for s in lines { + crash_if_err!(1, writer.write_all(s.line.as_bytes())); + crash_if_err!(1, writer.write_all(&[separator])); + } +} + +fn assert_child_success(mut child: Child, program: &str) { + if !matches!( + child.wait().map(|e| e.code()), + Ok(Some(0)) | Ok(None) | Err(_) + ) { + crash!(2, "'{}' terminated abnormally", program) + } +} diff --git a/src/uu/sort/src/merge.rs b/src/uu/sort/src/merge.rs index 696353829..478b454b6 100644 --- a/src/uu/sort/src/merge.rs +++ b/src/uu/sort/src/merge.rs @@ -9,8 +9,8 @@ use std::{ cmp::Ordering, - ffi::OsStr, - io::{Read, Write}, + fs::File, + io::{BufWriter, Read, Write}, iter, rc::Rc, sync::mpsc::{channel, sync_channel, Receiver, Sender, SyncSender}, @@ -18,18 +18,69 @@ use std::{ }; use compare::Compare; +use itertools::Itertools; use crate::{ chunks::{self, Chunk}, - compare_by, open, GlobalSettings, + compare_by, GlobalSettings, }; // Merge already sorted files. -pub fn merge<'a>(files: &[impl AsRef], settings: &'a GlobalSettings) -> FileMerger<'a> { +pub fn merge_with_file_limit>>( + files: F, + settings: &GlobalSettings, +) -> FileMerger { + if files.len() > settings.merge_batch_size { + let tmp_dir = tempfile::Builder::new() + .prefix("uutils_sort") + .tempdir_in(&settings.tmp_dir) + .unwrap(); + let mut batch_number = 0; + let mut remaining_files = files.len(); + let batches = files.chunks(settings.merge_batch_size); + let mut batches = batches.into_iter(); + while batch_number + remaining_files > settings.merge_batch_size && remaining_files != 0 { + remaining_files = remaining_files.saturating_sub(settings.merge_batch_size); + let mut merger = merge_without_limit(batches.next().unwrap(), settings); + let tmp_file = File::create(tmp_dir.path().join(batch_number.to_string())).unwrap(); + merger.write_all_to(settings, &mut BufWriter::new(tmp_file)); + batch_number += 1; + } + let batch_files = (0..batch_number).map(|n| { + Box::new(File::open(tmp_dir.path().join(n.to_string())).unwrap()) + as Box + }); + if batch_number > settings.merge_batch_size { + assert!(batches.next().is_none()); + merge_with_file_limit( + Box::new(batch_files) as Box>>, + settings, + ) + } else { + let final_batch = batches.next(); + assert!(batches.next().is_none()); + merge_without_limit( + batch_files.chain(final_batch.into_iter().flatten()), + settings, + ) + } + } else { + merge_without_limit(files, settings) + } +} + +/// Merge files without limiting how many files are concurrently open +/// +/// It is the responsibility of the caller to ensure that `files` yields only +/// as many files as we are allowed to open concurrently. +fn merge_without_limit>>( + files: F, + settings: &GlobalSettings, +) -> FileMerger { let (request_sender, request_receiver) = channel(); - let mut reader_files = Vec::with_capacity(files.len()); - let mut loaded_receivers = Vec::with_capacity(files.len()); - for (file_number, file) in files.iter().map(open).enumerate() { + let mut reader_files = Vec::with_capacity(files.size_hint().0); + let mut loaded_receivers = Vec::with_capacity(files.size_hint().0); + for (file_number, file) in files.enumerate() { let (sender, receiver) = sync_channel(2); loaded_receivers.push(receiver); reader_files.push(ReaderFile { @@ -146,7 +197,11 @@ impl<'a> FileMerger<'a> { /// Write the merged contents to the output file. pub fn write_all(&mut self, settings: &GlobalSettings) { let mut out = settings.out_writer(); - while self.write_next(settings, &mut out) {} + self.write_all_to(settings, &mut out); + } + + pub fn write_all_to(&mut self, settings: &GlobalSettings, out: &mut impl Write) { + while self.write_next(settings, out) {} } fn write_next(&mut self, settings: &GlobalSettings, out: &mut impl Write) -> bool { diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 5825e73bd..70e3325ad 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -95,6 +95,8 @@ static OPT_PARALLEL: &str = "parallel"; static OPT_FILES0_FROM: &str = "files0-from"; static OPT_BUF_SIZE: &str = "buffer-size"; static OPT_TMP_DIR: &str = "temporary-directory"; +static OPT_COMPRESS_PROG: &str = "compress-program"; +static OPT_BATCH_SIZE: &str = "batch-size"; static ARG_FILES: &str = "files"; @@ -155,6 +157,8 @@ pub struct GlobalSettings { zero_terminated: bool, buffer_size: usize, tmp_dir: PathBuf, + compress_prog: Option, + merge_batch_size: usize, } impl GlobalSettings { @@ -223,6 +227,8 @@ impl Default for GlobalSettings { zero_terminated: false, buffer_size: DEFAULT_BUF_SIZE, tmp_dir: PathBuf::new(), + compress_prog: None, + merge_batch_size: 16, } } } @@ -1076,6 +1082,19 @@ pub fn uumain(args: impl uucore::Args) -> i32 { .takes_value(true) .value_name("DIR"), ) + .arg( + Arg::with_name(OPT_COMPRESS_PROG) + .long(OPT_COMPRESS_PROG) + .help("compress temporary files with PROG, decompress with PROG -d") + .long_help("PROG has to take input from stdin and output to stdout") + .value_name("PROG") + ) + .arg( + Arg::with_name(OPT_BATCH_SIZE) + .long(OPT_BATCH_SIZE) + .help("Merge at most N_MERGE inputs at once.") + .value_name("N_MERGE") + ) .arg( Arg::with_name(OPT_FILES0_FROM) .long(OPT_FILES0_FROM) @@ -1165,6 +1184,14 @@ pub fn uumain(args: impl uucore::Args) -> i32 { .map(PathBuf::from) .unwrap_or_else(env::temp_dir); + settings.compress_prog = matches.value_of(OPT_COMPRESS_PROG).map(String::from); + + if let Some(n_merge) = matches.value_of(OPT_BATCH_SIZE) { + settings.merge_batch_size = n_merge + .parse() + .unwrap_or_else(|_| crash!(2, "invalid --batch-size argument '{}'", n_merge)); + } + settings.zero_terminated = matches.is_present(OPT_ZERO_TERMINATED); settings.merge = matches.is_present(OPT_MERGE); @@ -1240,7 +1267,7 @@ fn output_sorted_lines<'a>(iter: impl Iterator>, settings: & fn exec(files: &[String], settings: &GlobalSettings) -> i32 { if settings.merge { - let mut file_merger = merge::merge(files, settings); + let mut file_merger = merge::merge_with_file_limit(files.iter().map(open), settings); file_merger.write_all(settings); } else if settings.check { if files.len() > 1 { diff --git a/src/uucore/src/lib/features/entries.rs b/src/uucore/src/lib/features/entries.rs index d2dce2461..b94abbe4f 100644 --- a/src/uucore/src/lib/features/entries.rs +++ b/src/uucore/src/lib/features/entries.rs @@ -5,7 +5,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore (vars) Passwd cstr fnam gecos ngroups +// spell-checker:ignore (vars) Passwd cstr fnam gecos ngroups egid //! Get password/group file entry //! @@ -72,6 +72,41 @@ pub fn get_groups() -> IOResult> { } } +/// The list of group IDs returned from GNU's `groups` and GNU's `id --groups` +/// starts with the effective group ID (egid). +/// This is a wrapper for `get_groups()` to mimic this behavior. +/// +/// If `arg_id` is `None` (default), `get_groups_gnu` moves the effective +/// group id (egid) to the first entry in the returned Vector. +/// If `arg_id` is `Some(x)`, `get_groups_gnu` moves the id with value `x` +/// to the first entry in the returned Vector. This might be necessary +/// for `id --groups --real` if `gid` and `egid` are not equal. +/// +/// From: https://www.man7.org/linux/man-pages/man3/getgroups.3p.html +/// As implied by the definition of supplementary groups, the +/// effective group ID may appear in the array returned by +/// getgroups() or it may be returned only by getegid(). Duplication +/// may exist, but the application needs to call getegid() to be sure +/// of getting all of the information. Various implementation +/// variations and administrative sequences cause the set of groups +/// appearing in the result of getgroups() to vary in order and as to +/// whether the effective group ID is included, even when the set of +/// groups is the same (in the mathematical sense of ``set''). (The +/// history of a process and its parents could affect the details of +/// the result.) +pub fn get_groups_gnu(arg_id: Option) -> IOResult> { + let mut groups = get_groups()?; + let egid = arg_id.unwrap_or_else(crate::features::process::getegid); + if !groups.is_empty() && *groups.first().unwrap() == egid { + return Ok(groups); + } else if let Some(index) = groups.iter().position(|&x| x == egid) { + groups.remove(index); + } + groups.insert(0, egid); + Ok(groups) +} + +#[derive(Copy, Clone)] pub struct Passwd { inner: passwd, } @@ -268,3 +303,18 @@ pub fn usr2uid(name: &str) -> IOResult { pub fn grp2gid(name: &str) -> IOResult { Group::locate(name).map(|p| p.gid()) } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_entries_get_groups_gnu() { + if let Ok(mut groups) = get_groups() { + if let Some(last) = groups.pop() { + groups.insert(0, last); + assert_eq!(get_groups_gnu(Some(last)).unwrap(), groups); + } + } + } +} diff --git a/tests/by-util/test_groups.rs b/tests/by-util/test_groups.rs index cee13bdc3..26ab6a75a 100644 --- a/tests/by-util/test_groups.rs +++ b/tests/by-util/test_groups.rs @@ -1,41 +1,53 @@ use crate::common::util::*; #[test] +#[cfg(any(target_vendor = "apple", target_os = "linux"))] fn test_groups() { - let result = new_ucmd!().run(); - println!("result.stdout = {}", result.stdout_str()); - println!("result.stderr = {}", result.stderr_str()); - if is_ci() && result.stdout_str().trim().is_empty() { - // In the CI, some server are failing to return the group. - // As seems to be a configuration issue, ignoring it - return; + if !is_ci() { + new_ucmd!().succeeds().stdout_is(expected_result(&[])); + } else { + // TODO: investigate how this could be tested in CI + // stderr = groups: cannot find name for group ID 116 + println!("test skipped:"); } - result.success(); - assert!(!result.stdout_str().trim().is_empty()); } #[test] -fn test_groups_arg() { - // get the username with the "id -un" command - let result = TestScenario::new("id").ucmd_keepenv().arg("-un").run(); - println!("result.stdout = {}", result.stdout_str()); - println!("result.stderr = {}", result.stderr_str()); - let s1 = String::from(result.stdout_str().trim()); - if is_ci() && s1.parse::().is_ok() { - // In the CI, some server are failing to return id -un. - // So, if we are getting a uid, just skip this test - // As seems to be a configuration issue, ignoring it +#[cfg(any(target_os = "linux"))] +#[ignore = "fixme: 'groups USERNAME' needs more debugging"] +fn test_groups_username() { + let scene = TestScenario::new(util_name!()); + let whoami_result = scene.cmd("whoami").run(); + + let username = if whoami_result.succeeded() { + whoami_result.stdout_move_str() + } else if is_ci() { + String::from("docker") + } else { + println!("test skipped:"); return; - } + }; - println!("result.stdout = {}", result.stdout_str()); - println!("result.stderr = {}", result.stderr_str()); - result.success(); - assert!(!result.stdout_str().is_empty()); - let username = result.stdout_str().trim(); + // TODO: stdout should be in the form: "username : group1 group2 group3" - // call groups with the user name to check that we - // are getting something - new_ucmd!().arg(username).succeeds(); - assert!(!result.stdout_str().is_empty()); + scene + .ucmd() + .arg(&username) + .succeeds() + .stdout_is(expected_result(&[&username])); +} + +#[cfg(any(target_vendor = "apple", target_os = "linux"))] +fn expected_result(args: &[&str]) -> String { + #[cfg(target_os = "linux")] + let util_name = util_name!(); + #[cfg(target_vendor = "apple")] + let util_name = format!("g{}", util_name!()); + + TestScenario::new(&util_name) + .cmd_keepenv(util_name) + .env("LANGUAGE", "C") + .args(args) + .succeeds() + .stdout_move_str() } diff --git a/tests/by-util/test_id.rs b/tests/by-util/test_id.rs index a8ad37190..a9c7e31ae 100644 --- a/tests/by-util/test_id.rs +++ b/tests/by-util/test_id.rs @@ -112,28 +112,23 @@ fn test_id_group() { } #[test] +#[cfg(any(target_vendor = "apple", target_os = "linux"))] fn test_id_groups() { let scene = TestScenario::new(util_name!()); - - let result = scene.ucmd().arg("-G").succeeds(); - let groups = result.stdout_str().trim().split_whitespace(); - for s in groups { - assert!(s.parse::().is_ok()); - } - - let result = scene.ucmd().arg("--groups").succeeds(); - let groups = result.stdout_str().trim().split_whitespace(); - for s in groups { - assert!(s.parse::().is_ok()); - } - - #[cfg(any(target_vendor = "apple", target_os = "linux"))] - for args in &["-G", "--groups"] { - let expect = expected_result(&[args], false); - let actual = new_ucmd!().arg(&args).succeeds().stdout_move_str(); - let mut v_actual: Vec<&str> = actual.split_whitespace().collect(); - let mut v_expect: Vec<&str> = expect.split_whitespace().collect(); - assert_eq!(v_actual.sort_unstable(), v_expect.sort_unstable()); + for g_flag in &["-G", "--groups"] { + scene + .ucmd() + .arg(g_flag) + .succeeds() + .stdout_is(expected_result(&[g_flag], false)); + for &r_flag in &["-r", "--real"] { + let args = [g_flag, r_flag]; + scene + .ucmd() + .args(&args) + .succeeds() + .stdout_is(expected_result(&args, false)); + } } } @@ -196,26 +191,28 @@ fn test_id_password_style() { #[test] #[cfg(any(target_vendor = "apple", target_os = "linux"))] fn test_id_default_format() { + let scene = TestScenario::new(util_name!()); // -ugG for flag in &["--name", "--real"] { - new_ucmd!() + scene + .ucmd() .arg(flag) .fails() .stderr_is(expected_result(&[flag], true)); for &opt in &["--user", "--group", "--groups"] { if is_ci() && *flag == "--name" { - // '--name' does not work in CI: + // '--name' does not work on CICD ubuntu-16/ubuntu-18 // id: cannot find name for user ID 1001 // id: cannot find name for group ID 116 - println!("test skipped:"); + println!("test skipped"); continue; } let args = [opt, flag]; - let expect = expected_result(&args, false); - let actual = new_ucmd!().args(&args).succeeds().stdout_move_str(); - let mut v_actual: Vec<&str> = actual.split_whitespace().collect(); - let mut v_expect: Vec<&str> = expect.split_whitespace().collect(); - assert_eq!(v_actual.sort_unstable(), v_expect.sort_unstable()); + scene + .ucmd() + .args(&args) + .succeeds() + .stdout_is(expected_result(&args, false)); } } } @@ -231,21 +228,13 @@ fn test_id_zero() { .fails() .stderr_is(expected_result(&args, true)); } - for &opt in &["-u", "--user", "-g", "--group"] { + for &opt in &["-u", "--user", "-g", "--group", "-G", "--groups"] { let args = [opt, z_flag]; new_ucmd!() .args(&args) .succeeds() .stdout_is(expected_result(&args, false)); } - // '--groups' ids are in no particular order and when paired with '--zero' there's no - // delimiter which makes the split_whitespace-collect-into-vector comparison impossible. - for opt in &["-G", "--groups"] { - let args = [opt, z_flag]; - let result = new_ucmd!().args(&args).succeeds().stdout_move_str(); - assert!(!result.contains(" ")); - assert!(result.ends_with('\0')); - } } } @@ -271,7 +260,6 @@ fn expected_result(args: &[&str], exp_fail: bool) -> String { .fails() .stderr_move_str() }; - // #[cfg(target_vendor = "apple")] return if cfg!(target_os = "macos") && result.starts_with("gid") { result[1..].to_string() } else { diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index 02636b027..75611abfc 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -792,3 +792,64 @@ fn test_nonexistent_file() { fn test_blanks() { test_helper("blanks", &["-b", "--ignore-blanks"]); } + +#[test] +fn sort_multiple() { + new_ucmd!() + .args(&["no_trailing_newline1.txt", "no_trailing_newline2.txt"]) + .succeeds() + .stdout_is("a\nb\nb\n"); +} + +#[test] +fn sort_empty_chunk() { + new_ucmd!() + .args(&["-S", "40B"]) + .pipe_in("a\na\n") + .succeeds() + .stdout_is("a\na\n"); +} + +#[test] +#[cfg(target_os = "linux")] +fn test_compress() { + new_ucmd!() + .args(&[ + "ext_sort.txt", + "-n", + "--compress-program", + "gzip", + "-S", + "10", + ]) + .succeeds() + .stdout_only_fixture("ext_sort.expected"); +} + +#[test] +fn test_compress_fail() { + new_ucmd!() + .args(&[ + "ext_sort.txt", + "-n", + "--compress-program", + "nonexistent-program", + "-S", + "10", + ]) + .fails() + .stderr_only("sort: couldn't execute compress program: errno 2"); +} + +#[test] +fn test_merge_batches() { + new_ucmd!() + .args(&[ + "ext_sort.txt", + "-n", + "-S", + "150B", + ]) + .succeeds() + .stdout_only_fixture("ext_sort.expected"); +} diff --git a/tests/fixtures/sort/no_trailing_newline1.txt b/tests/fixtures/sort/no_trailing_newline1.txt new file mode 100644 index 000000000..0a207c060 --- /dev/null +++ b/tests/fixtures/sort/no_trailing_newline1.txt @@ -0,0 +1,2 @@ +a +b \ No newline at end of file diff --git a/tests/fixtures/sort/no_trailing_newline2.txt b/tests/fixtures/sort/no_trailing_newline2.txt new file mode 100644 index 000000000..63d8dbd40 --- /dev/null +++ b/tests/fixtures/sort/no_trailing_newline2.txt @@ -0,0 +1 @@ +b \ No newline at end of file