From 2658f8ae5badb3a5e55968ec396a29c891e0c795 Mon Sep 17 00:00:00 2001 From: Joining7943 <111500881+Joining7943@users.noreply.github.com> Date: Fri, 9 Sep 2022 13:50:59 +0200 Subject: [PATCH] tail: improve performance of piped stdin Rewrite handling of stdin when it is piped and read input in chunks. Fixes https://github.com/uutils/coreutils/issues/3842 --- Cargo.lock | 1 + src/uu/tail/Cargo.toml | 1 + src/uu/tail/src/chunks.rs | 618 ++++++++++++++++++++++++++++++- src/uu/tail/src/tail.rs | 119 +++--- tests/by-util/test_tail.rs | 733 ++++++++++++++++++++++++++++++++++++- tests/common/mod.rs | 1 + tests/common/random.rs | 314 ++++++++++++++++ 7 files changed, 1704 insertions(+), 83 deletions(-) create mode 100644 tests/common/random.rs diff --git a/Cargo.lock b/Cargo.lock index d0bf1c93c..25f8c6aa6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2871,6 +2871,7 @@ version = "0.0.15" dependencies = [ "clap", "libc", + "memchr", "nix", "notify", "same-file", diff --git a/src/uu/tail/Cargo.toml b/src/uu/tail/Cargo.toml index 8e12beafa..dd5747cc4 100644 --- a/src/uu/tail/Cargo.toml +++ b/src/uu/tail/Cargo.toml @@ -18,6 +18,7 @@ path = "src/tail.rs" [dependencies] clap = { version = "3.2", features = ["wrap_help", "cargo"] } libc = "0.2.132" +memchr = "2.5.0" notify = { version = "=5.0.0-pre.16", features=["macos_kqueue"]} uucore = { version=">=0.0.15", package="uucore", path="../../uucore", features=["ringbuffer", "lines"] } same-file = "1.0.6" diff --git a/src/uu/tail/src/chunks.rs b/src/uu/tail/src/chunks.rs index 0ba64540a..8fb53c769 100644 --- a/src/uu/tail/src/chunks.rs +++ b/src/uu/tail/src/chunks.rs @@ -1,14 +1,29 @@ -//! Iterating over a file by chunks, starting at the end of the file. +// * This file is part of the uutils coreutils package. +// * +// * For the full copyright and license information, please view the LICENSE +// * file that was distributed with this source code. + +//! 
Iterating over a file by chunks, either starting at the end of the file with [`ReverseChunks`] +//! or at the end of piped stdin with [`LinesChunk`] or [`BytesChunk`]. //! -//! Use [`ReverseChunks::new`] to create a new iterator over chunks of -//! bytes from the file. +//! Use [`ReverseChunks::new`] to create a new iterator over chunks of bytes from the file. +// spell-checker:ignore (ToDO) filehandle BUFSIZ +use std::collections::VecDeque; use std::fs::File; -use std::io::{Read, Seek, SeekFrom}; +use std::io::{BufReader, Read, Seek, SeekFrom, Write}; +use uucore::error::UResult; /// When reading files in reverse in `bounded_tail`, this is the size of each /// block read at a time. pub const BLOCK_SIZE: u64 = 1 << 16; +/// The size of the backing buffer of a LinesChunk or BytesChunk in bytes. The value of BUFFER_SIZE +/// originates from the BUFSIZ constant in stdio.h and the libc crate to make stream IO efficient. +/// In the latter the value is constantly set to 8192 on all platforms, where the value in stdio.h +/// is determined on each platform differently. Since libc chose 8192 as a reasonable default the +/// value here is set to this value, too. +pub const BUFFER_SIZE: usize = 8192; + /// An iterator over a file in non-overlapping chunks from the end of the file. /// /// Each chunk is a [`Vec`]<[`u8`]> of size [`BLOCK_SIZE`] (except @@ -86,3 +101,598 @@ impl<'a> Iterator for ReverseChunks<'a> { Some(buf[0..(block_size as usize)].to_vec()) } } + +/// The type of the backing buffer of [`BytesChunk`] and [`LinesChunk`] which can hold +/// [`BUFFER_SIZE`] elements at max. +type ChunkBuffer = [u8; BUFFER_SIZE]; + +/// A [`BytesChunk`] storing a fixed size number of bytes in a buffer. +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct BytesChunk { + /// The [`ChunkBuffer`], an array storing the bytes, for example filled by + /// [`BytesChunk::fill`] + buffer: ChunkBuffer, + + /// Stores the number of bytes, this buffer holds. 
This is not equal to buffer.len(), since the + /// [`BytesChunk`] may store fewer bytes than the internal buffer can hold. In addition + /// [`BytesChunk`] may be reused, which makes it necessary to track the number of stored bytes. + /// The choice of usize is sufficient here, since the number of bytes max value is + /// [`BUFFER_SIZE`], which is a usize. + bytes: usize, +} + +impl BytesChunk { + #[allow(clippy::new_without_default)] + pub fn new() -> Self { + Self { + buffer: [0; BUFFER_SIZE], + bytes: 0, + } + } + + /// Create a new chunk from an existing chunk. The new chunk's buffer will be copied from the + /// old chunk's buffer, copying the slice `[offset..old_chunk.bytes]` into the new chunk's + /// buffer but starting at 0 instead of offset. If the offset is larger than or equal to + /// `chunk.bytes` then a new empty `BytesChunk` is returned. + /// + /// # Arguments + /// + /// * `chunk`: The chunk to create a new `BytesChunk` chunk from + /// * `offset`: Start to copy the old chunk's buffer from this position. May not be larger + /// than `chunk.bytes`. + /// + /// # Examples + /// + /// ```rust,ignore + /// let mut chunk = BytesChunk::new(); + /// chunk.buffer[1] = 1; + /// chunk.bytes = 2; + /// let new_chunk = BytesChunk::from_chunk(&chunk, 0); + /// assert_eq!(2, new_chunk.get_buffer().len()); + /// assert_eq!(&[0, 1], new_chunk.get_buffer()); + /// + /// let new_chunk = BytesChunk::from_chunk(&chunk, 1); + /// assert_eq!(1, new_chunk.get_buffer().len()); + /// assert_eq!(&[1], new_chunk.get_buffer()); + /// ``` + fn from_chunk(chunk: &Self, offset: usize) -> Self { + if offset >= chunk.bytes { + return Self::new(); + } + + let mut buffer: ChunkBuffer = [0; BUFFER_SIZE]; + let slice = chunk.get_buffer_with(offset); + buffer[..slice.len()].copy_from_slice(slice); + Self { + buffer, + bytes: chunk.bytes - offset, + } + } + + /// Returns the internal buffer safely, i.e. as a slice containing only as many bytes as the + /// `self.bytes` value indicates.
+ /// + /// returns: a slice containing the bytes of the internal buffer from `[0..self.bytes]` + /// + /// # Examples + /// + /// ```rust,ignore + /// let mut chunk = BytesChunk::new(); + /// chunk.bytes = 1; + /// assert_eq!(&[0], chunk.get_buffer()); + /// ``` + pub fn get_buffer(&self) -> &[u8] { + &self.buffer[..self.bytes] + } + + /// Like [`BytesChunk::get_buffer`], but returning a slice from `[offset..self.bytes]`. + /// + /// returns: a slice containing the bytes of the internal buffer from `[offset..self.bytes]` + /// + /// # Examples + /// + /// ```rust,ignore + /// let mut chunk = BytesChunk::new(); + /// chunk.bytes = 2; + /// assert_eq!(&[0], chunk.get_buffer_with(1)); + /// ``` + pub fn get_buffer_with(&self, offset: usize) -> &[u8] { + &self.buffer[offset..self.bytes] + } + + pub fn has_data(&self) -> bool { + self.bytes > 0 + } + + /// Fills `self.buffer` with at most [`BUFFER_SIZE`] bytes, draining the reader by + /// that number of bytes. If EOF is reached (so 0 bytes are read), then returns + /// `Ok(None)`, or else returns `Ok(Some(bytes))` where bytes is the number of bytes + /// read from the source. + pub fn fill(&mut self, filehandle: &mut BufReader) -> UResult> { + let num_bytes = filehandle.read(&mut self.buffer)?; + self.bytes = num_bytes; + if num_bytes == 0 { + return Ok(None); + } + + Ok(Some(self.bytes)) + } +} + +/// An abstraction layer on top of [`BytesChunk`] mainly to simplify filling only the needed amount +/// of chunks. See also [`Self::fill`]. +pub struct BytesChunkBuffer { + /// The number of bytes to print + num_print: u64, + /// The current number of bytes summed over all stored chunks in [`Self::chunks`]. Use u64 here + /// to support files > 4GB on 32-bit systems. Note, this differs from `BytesChunk::bytes` which + /// is a usize. The choice of u64 is based on `tail::FilterMode::Bytes`.
+ bytes: u64, + /// The buffer to store [`BytesChunk`] in + chunks: VecDeque>, +} + +impl BytesChunkBuffer { + /// Creates a new [`BytesChunkBuffer`]. + /// + /// # Arguments + /// + /// * `num_print`: The number of bytes to print + /// + /// # Examples + /// + /// ```rust,ignore + /// let mut chunk = BytesChunk::new(); + /// chunk.buffer[1] = 1; + /// chunk.bytes = 2; + /// let new_chunk = BytesChunk::from_chunk(&chunk, 0); + /// assert_eq!(2, new_chunk.get_buffer().len()); + /// assert_eq!(&[0, 1], new_chunk.get_buffer()); + /// + /// let new_chunk = BytesChunk::from_chunk(&chunk, 1); + /// assert_eq!(1, new_chunk.get_buffer().len()); + /// assert_eq!(&[1], new_chunk.get_buffer()); + /// ``` + pub fn new(num_print: u64) -> Self { + Self { + bytes: 0, + num_print, + chunks: VecDeque::new(), + } + } + + /// Fills this buffer with chunks and consumes the reader completely. This method ensures that + /// there are exactly as many chunks as needed to match `self.num_print` bytes, so there are + /// in sum exactly `self.num_print` bytes stored in all chunks. The chunks stay stored in this + /// buffer and are written by `print`. If nothing is stored, for example because the piped stdin + /// contained no bytes, or `num_print = 0`, then `print` writes nothing.
+ /// + /// # Examples + /// + /// ```rust,ignore + /// use crate::chunks::BytesChunkBuffer; + /// use std::io::{BufReader, Cursor}; + /// + /// let mut reader = BufReader::new(Cursor::new("")); + /// let num_print = 0; + /// let mut chunks = BytesChunkBuffer::new(num_print); + /// chunks.fill(&mut reader).unwrap(); + /// + /// let mut reader = BufReader::new(Cursor::new("a")); + /// let num_print = 1; + /// let mut chunks = BytesChunkBuffer::new(num_print); + /// chunks.fill(&mut reader).unwrap(); + /// ``` + pub fn fill(&mut self, reader: &mut BufReader) -> UResult<()> { + let mut chunk = Box::new(BytesChunk::new()); + + // fill chunks with all bytes from reader and reuse already instantiated chunks if possible + while (chunk.fill(reader)?).is_some() { + self.bytes += chunk.bytes as u64; + self.chunks.push_back(chunk); + + let first = &self.chunks[0]; + if self.bytes - first.bytes as u64 > self.num_print { + chunk = self.chunks.pop_front().unwrap(); + self.bytes -= chunk.bytes as u64; + } else { + chunk = Box::new(BytesChunk::new()); + } + } + + // quit early if there are no chunks for example in case the pipe was empty + if self.chunks.is_empty() { + return Ok(()); + } + + let chunk = self.chunks.pop_front().unwrap(); + + // calculate the offset in the first chunk and put the calculated chunk as first element in + // the self.chunks collection. The calculated offset must be in the range 0 to BUFFER_SIZE + // and is therefore safely convertible to a usize without losses. + let offset = self.bytes.saturating_sub(self.num_print) as usize; + self.chunks + .push_front(Box::new(BytesChunk::from_chunk(&chunk, offset))); + + Ok(()) + } + + pub fn print(&self, mut writer: impl Write) -> UResult<()> { + for chunk in &self.chunks { + writer.write_all(chunk.get_buffer())?; + } + Ok(()) + } +} + +/// Works similar to a [`BytesChunk`] but also stores the number of lines encountered in the current +/// buffer. The size of the buffer is limited to a fixed size number of bytes. 
+#[derive(Debug)] +pub struct LinesChunk { + /// Work on top of a [`BytesChunk`] + chunk: BytesChunk, + /// The number of lines delimited by `delimiter`. The choice of usize is sufficient here, + /// because lines max value is the number of bytes contained in this chunk's buffer, and the + /// number of bytes max value is [`BUFFER_SIZE`], which is a usize. + lines: usize, + /// The delimiter to use, to count the lines + delimiter: u8, +} + +impl LinesChunk { + pub fn new(delimiter: u8) -> Self { + Self { + chunk: BytesChunk::new(), + lines: 0, + delimiter, + } + } + + /// Count the number of lines delimited with [`Self::delimiter`] contained in the buffer. + /// Currently [`memchr`] is used because performance is better than using an iterator or for + /// loop. + /// + /// # Examples + /// + /// ```rust,ignore + /// let mut chunk = LinesChunk::new(b'\n'); + /// chunk.buffer[0..12].copy_from_slice("hello\nworld\n".as_bytes()); + /// chunk.bytes = 12; + /// assert_eq!(2, chunk.count_lines()); + /// + /// chunk.buffer[0..14].copy_from_slice("hello\r\nworld\r\n".as_bytes()); + /// chunk.bytes = 14; + /// assert_eq!(2, chunk.count_lines()); + /// ``` + fn count_lines(&self) -> usize { + memchr::memchr_iter(self.delimiter, self.get_buffer()).count() + } + + /// Creates a new [`LinesChunk`] from an existing one with an offset in lines. The new chunk + /// contains exactly `chunk.lines - offset` lines. The offset in bytes is calculated and applied + /// to the new chunk, so the new chunk contains only the bytes encountered after the offset in + /// number of lines and the `delimiter`. If the offset is larger than `chunk.lines` then a new + /// empty `LinesChunk` is returned. 
+ /// + /// # Arguments + /// + /// * `chunk`: The chunk to create the new chunk from + /// * `offset`: The offset in number of lines (not bytes) + /// + /// # Examples + /// + /// ```rust,ignore + /// let mut chunk = LinesChunk::new(b'\n'); + /// // manually filling the buffer and setting the correct values for bytes and lines + /// chunk.chunk.buffer[0..12].copy_from_slice("hello\nworld\n".as_bytes()); + /// chunk.chunk.bytes = 12; + /// chunk.lines = 2; + /// + /// let offset = 1; // offset in number of lines + /// let new_chunk = LinesChunk::from_chunk(&chunk, offset); + /// assert_eq!("world\n".as_bytes(), new_chunk.get_buffer()); + /// assert_eq!(6, new_chunk.chunk.bytes); + /// assert_eq!(1, new_chunk.lines); + /// ``` + fn from_chunk(chunk: &Self, offset: usize) -> Self { + if offset > chunk.lines { + return Self::new(chunk.delimiter); + } + + let bytes_offset = chunk.calculate_bytes_offset_from(offset); + let new_chunk = BytesChunk::from_chunk(&chunk.chunk, bytes_offset); + + Self { + chunk: new_chunk, + lines: chunk.lines - offset, + delimiter: chunk.delimiter, + } + } + + /// Returns true if this buffer has stored any bytes. + /// + /// # Examples + /// + /// ```rust,ignore + /// let mut chunk = LinesChunk::new(b'\n'); + /// assert!(!chunk.has_data()); + /// + /// chunk.chunk.buffer[0] = 1; + /// assert!(!chunk.has_data()); + /// + /// chunk.chunk.bytes = 1; + /// assert!(chunk.has_data()); + /// ``` + pub fn has_data(&self) -> bool { + self.chunk.has_data() + } + + /// Returns this buffer safely. See [`BytesChunk::get_buffer`] + /// + /// returns: &[u8] with length `self.chunk.bytes` + pub fn get_buffer(&self) -> &[u8] { + self.chunk.get_buffer() + } + + /// Returns this buffer safely with an offset applied. See [`BytesChunk::get_buffer_with`]. + /// + /// returns: &[u8] with length `self.chunk.bytes - offset` + pub fn get_buffer_with(&self, offset: usize) -> &[u8] { + self.chunk.get_buffer_with(offset) + } + + /// Return the number of lines the buffer contains.
`self.lines` needs to be set before the call + /// to this function returns the correct value. If the calculation of lines is needed then + /// use `self.count_lines`. + pub fn get_lines(&self) -> usize { + self.lines + } + + /// Fills `self.buffer` with maximal [`BUFFER_SIZE`] number of bytes, draining the reader by + /// that number of bytes. This function works like the [`BytesChunk::fill`] function besides + /// that this function also counts and stores the number of lines encountered while reading from + /// the `filehandle`. + pub fn fill(&mut self, filehandle: &mut BufReader) -> UResult> { + match self.chunk.fill(filehandle)? { + None => { + self.lines = 0; + Ok(None) + } + Some(bytes) => { + self.lines = self.count_lines(); + Ok(Some(bytes)) + } + } + } + + /// Calculates the offset in bytes within this buffer from the offset in number of lines. The + /// resulting offset is 0-based and points to the byte after the delimiter. + /// + /// # Arguments + /// + /// * `offset`: the offset in number of lines. If offset is 0 then 0 is returned, if larger than + /// the contained lines then self.bytes is returned. 
+ /// + /// # Examples + /// + /// ```rust,ignore + /// let mut chunk = LinesChunk::new(b'\n'); + /// chunk.buffer[0..12].copy_from_slice("hello\nworld\n".as_bytes()); + /// chunk.bytes = 12; + /// chunk.lines = 2; // note that if not setting lines the result might not be what is expected + /// let bytes_offset = chunk.calculate_bytes_offset_from(1); + /// assert_eq!(6, bytes_offset); + /// assert_eq!( + /// "world\n", + /// String::from_utf8_lossy(chunk.get_buffer_with(bytes_offset))); + /// ``` + fn calculate_bytes_offset_from(&self, offset: usize) -> usize { + let mut lines_offset = offset; + let mut bytes_offset = 0; + for byte in self.get_buffer().iter() { + if lines_offset == 0 { + break; + } + if byte == &self.delimiter { + lines_offset -= 1; + } + bytes_offset += 1; + } + bytes_offset + } + + /// Print the bytes contained in this buffer calculated with the given offset in number of + /// lines. + /// + /// # Arguments + /// + /// * `writer`: must implement [`Write`] + /// * `offset`: An offset in number of lines. + pub fn print_lines(&self, writer: &mut impl Write, offset: usize) -> UResult<()> { + self.print_bytes(writer, self.calculate_bytes_offset_from(offset)) + } + + /// Print the bytes contained in this buffer beginning from the given offset in number of bytes. + /// + /// # Arguments + /// + /// * `writer`: must implement [`Write`] + /// * `offset`: An offset in number of bytes. + pub fn print_bytes(&self, writer: &mut impl Write, offset: usize) -> UResult<()> { + writer.write_all(self.get_buffer_with(offset))?; + Ok(()) + } +} + +/// An abstraction layer on top of [`LinesChunk`] mainly to simplify filling only the needed amount +/// of chunks. See also [`Self::fill`]. Works similar like [`BytesChunkBuffer`], but works on top +/// of lines delimited by `self.delimiter` instead of bytes. +pub struct LinesChunkBuffer { + /// The delimiter to recognize a line. Any [`u8`] is allowed. 
+ delimiter: u8, + /// The amount of lines occurring in all currently stored [`LinesChunk`]s. Use u64 here to + /// support files > 4GB on 32-bit systems. Note, this differs from [`LinesChunk::lines`] which + /// is a usize. The choice of u64 is based on `tail::FilterMode::Lines`. + lines: u64, + /// The amount of lines to print. + num_print: u64, + /// Stores the [`LinesChunk`] + chunks: VecDeque>, +} + +impl LinesChunkBuffer { + /// Create a new [`LinesChunkBuffer`] + pub fn new(delimiter: u8, num_print: u64) -> Self { + Self { + delimiter, + num_print, + lines: 0, + chunks: VecDeque::new(), + } + } + + /// Fills this buffer with chunks and consumes the reader completely. This method ensures that + /// there are exactly as many chunks as needed to match `self.num_print` lines, so there are + /// in sum exactly `self.num_print` lines stored in all chunks. The chunks stay stored in this + /// buffer and are written by `print`. If nothing is stored, for example because the piped stdin + /// contained no lines, or `num_print = 0`, then `print` writes nothing.
+ pub fn fill(&mut self, reader: &mut BufReader) -> UResult<()> { + let mut chunk = Box::new(LinesChunk::new(self.delimiter)); + + while (chunk.fill(reader)?).is_some() { + self.lines += chunk.lines as u64; + self.chunks.push_back(chunk); + + let first = &self.chunks[0]; + if self.lines - first.lines as u64 > self.num_print { + chunk = self.chunks.pop_front().unwrap(); + + self.lines -= chunk.lines as u64; + } else { + chunk = Box::new(LinesChunk::new(self.delimiter)); + } + } + + if !&self.chunks.is_empty() { + let length = &self.chunks.len(); + let last = &mut self.chunks[length - 1]; + if !last.get_buffer().ends_with(&[self.delimiter]) { + last.lines += 1; + self.lines += 1; + } + } else { + // chunks is empty when a file is empty so quitting early here + return Ok(()); + } + + // skip unnecessary chunks and save the first chunk which may hold some lines we have to + // print + let chunk = loop { + // it's safe to call unwrap here because there is at least one chunk and sorting out + // more chunks than exist shouldn't be possible. + let chunk = self.chunks.pop_front().unwrap(); + + // skip is true as long there are enough lines left in the other stored chunks. + let skip = self.lines - chunk.lines as u64 > self.num_print; + if skip { + self.lines -= chunk.lines as u64; + } else { + break chunk; + } + }; + + // Calculate the number of lines to skip in the current chunk. The calculated value must be + // in the range 0 to BUFFER_SIZE and is therefore safely convertible to a usize without + // losses. 
+ let skip_lines = self.lines.saturating_sub(self.num_print) as usize; + let chunk = LinesChunk::from_chunk(&chunk, skip_lines); + self.chunks.push_front(Box::new(chunk)); + + Ok(()) + } + + pub fn print(&self, mut writer: impl Write) -> UResult<()> { + for chunk in &self.chunks { + chunk.print_bytes(&mut writer, 0)?; + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::chunks::{BytesChunk, BUFFER_SIZE}; + + #[test] + fn test_bytes_chunk_from_when_offset_is_zero() { + let mut chunk = BytesChunk::new(); + chunk.bytes = BUFFER_SIZE; + chunk.buffer[1] = 1; + let other = BytesChunk::from_chunk(&chunk, 0); + assert_eq!(other, chunk); + + chunk.bytes = 2; + let other = BytesChunk::from_chunk(&chunk, 0); + assert_eq!(other, chunk); + + chunk.bytes = 1; + let other = BytesChunk::from_chunk(&chunk, 0); + assert_eq!(other.buffer, [0; BUFFER_SIZE]); + assert_eq!(other.bytes, chunk.bytes); + + chunk.bytes = BUFFER_SIZE; + let other = BytesChunk::from_chunk(&chunk, 2); + assert_eq!(other.buffer, [0; BUFFER_SIZE]); + assert_eq!(other.bytes, BUFFER_SIZE - 2); + } + + #[test] + fn test_bytes_chunk_from_when_offset_is_not_zero() { + let mut chunk = BytesChunk::new(); + chunk.bytes = BUFFER_SIZE; + chunk.buffer[1] = 1; + + let other = BytesChunk::from_chunk(&chunk, 1); + let mut expected_buffer = [0; BUFFER_SIZE]; + expected_buffer[0] = 1; + assert_eq!(other.buffer, expected_buffer); + assert_eq!(other.bytes, BUFFER_SIZE - 1); + + let other = BytesChunk::from_chunk(&chunk, 2); + assert_eq!(other.buffer, [0; BUFFER_SIZE]); + assert_eq!(other.bytes, BUFFER_SIZE - 2); + } + + #[test] + fn test_bytes_chunk_from_when_offset_is_larger_than_chunk_size_1() { + let mut chunk = BytesChunk::new(); + chunk.bytes = BUFFER_SIZE; + let new_chunk = BytesChunk::from_chunk(&chunk, BUFFER_SIZE + 1); + assert_eq!(0, new_chunk.bytes); + } + + #[test] + fn test_bytes_chunk_from_when_offset_is_larger_than_chunk_size_2() { + let mut chunk = BytesChunk::new(); + chunk.bytes = 0; + let new_chunk = 
BytesChunk::from_chunk(&chunk, 1); + assert_eq!(0, new_chunk.bytes); + } + + #[test] + fn test_bytes_chunk_from_when_offset_is_larger_than_chunk_size_3() { + let mut chunk = BytesChunk::new(); + chunk.bytes = 1; + let new_chunk = BytesChunk::from_chunk(&chunk, 2); + assert_eq!(0, new_chunk.bytes); + } + + #[test] + fn test_bytes_chunk_from_when_offset_is_equal_to_chunk_size() { + let mut chunk = BytesChunk::new(); + chunk.buffer[0] = 1; + chunk.bytes = 1; + let new_chunk = BytesChunk::from_chunk(&chunk, 1); + assert_eq!(0, new_chunk.bytes); + } +} diff --git a/src/uu/tail/src/tail.rs b/src/uu/tail/src/tail.rs index 28a65093d..d8442f09b 100644 --- a/src/uu/tail/src/tail.rs +++ b/src/uu/tail/src/tail.rs @@ -7,7 +7,7 @@ // * For the full copyright and license information, please view the LICENSE // * file that was distributed with this source code. -// spell-checker:ignore (ToDO) seekable seek'd tail'ing ringbuffer ringbuf unwatch Uncategorized +// spell-checker:ignore (ToDO) seekable seek'd tail'ing ringbuffer ringbuf unwatch Uncategorized filehandle // spell-checker:ignore (libs) kqueue // spell-checker:ignore (acronyms) // spell-checker:ignore (env/flags) @@ -21,8 +21,9 @@ extern crate clap; #[macro_use] extern crate uucore; +extern crate core; -mod chunks; +pub mod chunks; mod parse; mod platform; use crate::files::FileHandling; @@ -30,11 +31,11 @@ use chunks::ReverseChunks; use clap::{Arg, Command, ValueSource}; use notify::{RecommendedWatcher, RecursiveMode, Watcher, WatcherKind}; +use std::cmp::Ordering; use std::collections::{HashMap, VecDeque}; use std::ffi::OsString; -use std::fmt; use std::fs::{File, Metadata}; -use std::io::{stdin, stdout, BufRead, BufReader, Read, Seek, SeekFrom, Write}; +use std::io::{self, stdin, stdout, BufRead, BufReader, BufWriter, Read, Seek, SeekFrom, Write}; use std::path::{Path, PathBuf}; use std::sync::mpsc::{self, channel, Receiver}; use std::time::Duration; @@ -43,9 +44,7 @@ use uucore::error::{ get_exit_code, set_exit_code, 
FromIo, UError, UResult, USimpleError, UUsageError, }; use uucore::format_usage; -use uucore::lines::lines; use uucore::parse_size::{parse_size, ParseSizeError}; -use uucore::ringbuffer::RingBuffer; #[cfg(unix)] use std::os::unix::fs::MetadataExt; @@ -1458,70 +1457,58 @@ fn bounded_tail(file: &mut File, settings: &Settings) { std::io::copy(file, &mut stdout).unwrap(); } -/// An alternative to [`Iterator::skip`] with u64 instead of usize. This is -/// necessary because the usize limit doesn't make sense when iterating over -/// something that's not in memory. For example, a very large file. This allows -/// us to skip data larger than 4 GiB even on 32-bit platforms. -fn skip_u64(iter: &mut impl Iterator, num: u64) { - for _ in 0..num { - if iter.next().is_none() { - break; - } - } -} - -/// Collect the last elements of an iterator into a `VecDeque`. -/// -/// This function returns a [`VecDeque`] containing either the last -/// `count` elements of `iter`, an [`Iterator`] over [`Result`] -/// instances, or all but the first `count` elements of `iter`. If -/// `beginning` is `true`, then all but the first `count` elements are -/// returned. -/// -/// # Panics -/// -/// If any element of `iter` is an [`Err`], then this function panics. -fn unbounded_tail_collect( - mut iter: impl Iterator>, - count: u64, - beginning: bool, -) -> UResult> -where - E: fmt::Debug, -{ - if beginning { - // GNU `tail` seems to index bytes and lines starting at 1, not - // at 0. It seems to treat `+0` and `+1` as the same thing. 
- let i = count.max(1) - 1; - skip_u64(&mut iter, i); - Ok(iter.map(|r| r.unwrap()).collect()) - } else { - let count: usize = count - .try_into() - .map_err(|_| USimpleError::new(1, "Insufficient addressable memory"))?; - Ok(RingBuffer::from_iter(iter.map(|r| r.unwrap()), count).data) - } -} - fn unbounded_tail(reader: &mut BufReader, settings: &Settings) -> UResult<()> { - // Read through each line/char and store them in a ringbuffer that always - // contains count lines/chars. When reaching the end of file, output the - // data in the ringbuf. - match settings.mode { - FilterMode::Lines(count, sep) => { - let mut stdout = stdout(); - for line in unbounded_tail_collect(lines(reader, sep), count, settings.beginning)? { - stdout - .write_all(&line) - .map_err_context(|| String::from("IO error"))?; - } + let stdout = stdout(); + let mut writer = BufWriter::new(stdout.lock()); + match (&settings.mode, settings.beginning) { + (FilterMode::Lines(count, sep), false) => { + let mut chunks = chunks::LinesChunkBuffer::new(*sep, *count); + chunks.fill(reader)?; + chunks.print(writer)?; } - FilterMode::Bytes(count) => { - for byte in unbounded_tail_collect(reader.bytes(), count, settings.beginning)? 
{ - if let Err(err) = stdout().write(&[byte]) { - return Err(USimpleError::new(1, err.to_string())); + (FilterMode::Lines(count, sep), true) => { + let mut num_skip = (*count).max(1) - 1; + let mut chunk = chunks::LinesChunk::new(*sep); + while chunk.fill(reader)?.is_some() { + let lines = chunk.get_lines() as u64; + if lines < num_skip { + num_skip -= lines; + } else { + break; } } + if chunk.has_data() { + chunk.print_lines(&mut writer, num_skip as usize)?; + io::copy(reader, &mut writer)?; + } + } + (FilterMode::Bytes(count), false) => { + let mut chunks = chunks::BytesChunkBuffer::new(*count); + chunks.fill(reader)?; + chunks.print(writer)?; + } + (FilterMode::Bytes(count), true) => { + let mut num_skip = (*count).max(1) - 1; + let mut chunk = chunks::BytesChunk::new(); + loop { + if let Some(bytes) = chunk.fill(reader)? { + let bytes: u64 = bytes as u64; + match bytes.cmp(&num_skip) { + Ordering::Less => num_skip -= bytes, + Ordering::Equal => { + break; + } + Ordering::Greater => { + writer.write_all(chunk.get_buffer_with(num_skip as usize))?; + break; + } + } + } else { + return Ok(()); + } + } + + io::copy(reader, &mut writer)?; } } Ok(()) diff --git a/tests/by-util/test_tail.rs b/tests/by-util/test_tail.rs index 1a48cebfe..442c07979 100644 --- a/tests/by-util/test_tail.rs +++ b/tests/by-util/test_tail.rs @@ -3,7 +3,7 @@ // * For the full copyright and license information, please view the LICENSE // * file that was distributed with this source code. 
-// spell-checker:ignore (ToDO) abcdefghijklmnopqrstuvwxyz efghijklmnopqrstuvwxyz vwxyz emptyfile file siette ocho nueve diez +// spell-checker:ignore (ToDO) abcdefghijklmnopqrstuvwxyz efghijklmnopqrstuvwxyz vwxyz emptyfile file siette ocho nueve diez MULT // spell-checker:ignore (libs) kqueue // spell-checker:ignore (jargon) tailable untailable @@ -1090,18 +1090,6 @@ fn test_invalid_num() { .fails() .stderr_str() .starts_with("tail: invalid number of lines: '1Y': Value too large for defined data type"); - #[cfg(target_pointer_width = "32")] - { - let sizes = ["1000G", "10T"]; - for size in &sizes { - new_ucmd!() - .args(&["-c", size]) - .fails() - .code_is(1) - .stderr_str() - .starts_with("tail: Insufficient addressable memory"); - } - } new_ucmd!() .args(&["-c", "-³"]) .fails() @@ -2484,6 +2472,725 @@ fn test_illegal_seek() { assert_eq!(p.wait().unwrap().code().unwrap(), 1); } +#[cfg(all(not(target_os = "android"), not(target_os = "windows")))] // FIXME: See https://github.com/uutils/coreutils/issues/3881 +mod pipe_tests { + use super::*; + use crate::common::random::*; + use rand::distributions::Alphanumeric; + use tail::chunks::BUFFER_SIZE as CHUNK_BUFFER_SIZE; + + #[test] + fn test_pipe_when_lines_option_value_is_higher_than_contained_lines() { + let test_string = "a\nb\n"; + new_ucmd!() + .args(&["-n", "3"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(test_string); + + new_ucmd!() + .args(&["-n", "4"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(test_string); + + new_ucmd!() + .args(&["-n", "999"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(test_string); + + new_ucmd!() + .args(&["-n", "+3"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .no_stdout() + .no_stderr(); + + new_ucmd!() + .args(&["-n", "+4"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .no_stdout() + .no_stderr(); + + new_ucmd!() + 
.args(&["-n", "+999"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .no_stdout() + .no_stderr(); + } + + #[test] + fn test_pipe_when_negative_lines_option_given_no_newline_at_eof() { + let test_string = "a\nb"; + + new_ucmd!() + .args(&["-n", "0"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .no_stdout() + .no_stderr(); + + new_ucmd!() + .args(&["-n", "1"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("b"); + + new_ucmd!() + .args(&["-n", "2"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("a\nb"); + } + + #[test] + fn test_pipe_when_positive_lines_option_given_no_newline_at_eof() { + let test_string = "a\nb"; + + new_ucmd!() + .args(&["-n", "+0"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("a\nb"); + + new_ucmd!() + .args(&["-n", "+1"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("a\nb"); + + new_ucmd!() + .args(&["-n", "+2"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("b"); + } + + #[test] + fn test_pipe_when_lines_option_given_multibyte_utf8_characters() { + // the test string consists of from left to right a 4-byte,3-byte,2-byte,1-byte utf-8 character + let test_string = "𝅘𝅥𝅮\n⏻\nƒ\na"; + + new_ucmd!() + .args(&["-n", "+0"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(test_string); + + new_ucmd!() + .args(&["-n", "+2"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("⏻\nƒ\na"); + + new_ucmd!() + .args(&["-n", "+3"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("ƒ\na"); + + new_ucmd!() + .args(&["-n", "+4"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("a"); + + new_ucmd!() + .args(&["-n", "+5"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + 
.no_stdout() + .no_stderr(); + + new_ucmd!() + .args(&["-n", "-4"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(test_string); + + new_ucmd!() + .args(&["-n", "-3"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("⏻\nƒ\na"); + + new_ucmd!() + .args(&["-n", "-2"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("ƒ\na"); + + new_ucmd!() + .args(&["-n", "-1"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("a"); + + new_ucmd!() + .args(&["-n", "-0"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .no_stdout() + .no_stderr(); + } + + #[test] + fn test_pipe_when_lines_option_given_input_size_is_equal_to_buffer_size_no_newline_at_eof() { + let total_lines = 1; + let random_string = RandomString::generate_with_delimiter( + Alphanumeric, + b'\n', + total_lines, + false, + CHUNK_BUFFER_SIZE, + ); + let random_string = random_string.as_str(); + let lines = random_string.split_inclusive('\n'); + + let expected = lines.clone().skip(1).collect::<String>(); + new_ucmd!() + .args(&["-n", "+2"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(expected); + + let expected = lines.clone().skip(1).collect::<String>(); + new_ucmd!() + .args(&["-n", "-1"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(expected); + } + + #[test] + fn test_pipe_when_lines_option_given_input_size_is_equal_to_buffer_size() { + let total_lines = 100; + let random_string = RandomString::generate_with_delimiter( + Alphanumeric, + b'\n', + total_lines, + true, + CHUNK_BUFFER_SIZE, + ); + let random_string = random_string.as_str(); + let lines = random_string.split_inclusive('\n'); + + new_ucmd!() + .args(&["-n", "+0"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(random_string); + + let expected = lines.clone().skip(1).collect::<String>(); + new_ucmd!() +
.args(&["-n", "+2"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(expected); + + new_ucmd!() + .args(&["-n", "-0"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .no_stdout() + .no_stderr(); + + let expected = lines.clone().skip(total_lines - 1).collect::<String>(); + new_ucmd!() + .args(&["-n", "-1"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(expected); + + let expected = lines.clone().skip(1).collect::<String>(); + new_ucmd!() + .args(&["-n", "-99"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(expected); + + new_ucmd!() + .args(&["-n", "-100"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(random_string); + } + + #[test] + fn test_pipe_when_lines_option_given_input_size_is_one_byte_greater_than_buffer_size() { + let total_lines = 100; + let random_string = RandomString::generate_with_delimiter( + Alphanumeric, + b'\n', + total_lines, + true, + CHUNK_BUFFER_SIZE + 1, + ); + let random_string = random_string.as_str(); + let lines = random_string.split_inclusive('\n'); + + new_ucmd!() + .args(&["-n", "+0"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(random_string); + + let expected = lines.clone().skip(total_lines - 1).collect::<String>(); + new_ucmd!() + .args(&["-n", "-1"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(expected); + + let expected = lines.clone().skip(1).collect::<String>(); + new_ucmd!() + .args(&["-n", "+2"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(expected); + + let expected = lines.clone().skip(1).collect::<String>(); + new_ucmd!() + .args(&["-n", "-99"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(expected); + } + + #[test] + fn test_pipe_when_lines_option_given_input_size_has_multiple_size_of_buffer_size() { + let total_lines = 100;
+ let random_string = RandomString::generate_with_delimiter( + Alphanumeric, + b'\n', + total_lines, + true, + CHUNK_BUFFER_SIZE * 3 + 1, + ); + let random_string = random_string.as_str(); + let lines = random_string.split_inclusive('\n'); + + new_ucmd!() + .args(&["-n", "+0"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(random_string); + + let expected = lines.clone().skip(1).collect::<String>(); + new_ucmd!() + .args(&["-n", "+2"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(expected); + + new_ucmd!() + .args(&["-n", "-0"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .no_stdout() + .no_stderr(); + + let expected = lines.clone().skip(total_lines - 1).collect::<String>(); + new_ucmd!() + .args(&["-n", "-1"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(expected); + + let expected = lines.clone().skip(1).collect::<String>(); + new_ucmd!() + .args(&["-n", "-99"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(expected); + + new_ucmd!() + .args(&["-n", "-100"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(random_string); + } + + #[test] + fn test_pipe_when_bytes_option_value_is_higher_than_contained_bytes() { + let test_string = "a\nb"; + new_ucmd!() + .args(&["-c", "4"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(test_string); + + new_ucmd!() + .args(&["-c", "5"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(test_string); + + new_ucmd!() + .args(&["-c", "999"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(test_string); + + new_ucmd!() + .args(&["-c", "+4"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .no_stdout() + .no_stderr(); + + new_ucmd!() + .args(&["-c", "+5"]) + .pipe_in(test_string) + .ignore_stdin_write_error() +
.succeeds() + .no_stdout() + .no_stderr(); + + new_ucmd!() + .args(&["-c", "+999"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .no_stdout() + .no_stderr(); + } + + #[test] + fn test_pipe_when_bytes_option_given_multibyte_utf8_characters() { + // the test string consists of from left to right a 4-byte,3-byte,2-byte,1-byte utf-8 character + let test_string = "𝅘𝅥𝅮⏻ƒa"; + + new_ucmd!() + .args(&["-c", "+0"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(test_string); + + new_ucmd!() + .args(&["-c", "+2"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(&test_string.as_bytes()[1..]); + + new_ucmd!() + .args(&["-c", "+5"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("⏻ƒa"); + + new_ucmd!() + .args(&["-c", "+8"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("ƒa"); + + new_ucmd!() + .args(&["-c", "+10"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("a"); + + new_ucmd!() + .args(&["-c", "+11"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .no_stdout() + .no_stderr(); + + new_ucmd!() + .args(&["-c", "-1"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("a"); + + new_ucmd!() + .args(&["-c", "-2"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(&"ƒa".as_bytes()[1..]); + + new_ucmd!() + .args(&["-c", "-3"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("ƒa"); + + new_ucmd!() + .args(&["-c", "-6"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only("⏻ƒa"); + + new_ucmd!() + .args(&["-c", "-10"]) + .pipe_in(test_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(test_string); + } + + #[test] + fn test_pipe_when_bytes_option_given_input_size_is_equal_to_buffer_size() { + let 
random_string = RandomString::generate(AlphanumericNewline, CHUNK_BUFFER_SIZE); + let random_string = random_string.as_str(); + + new_ucmd!() + .args(&["-c", "+0"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(random_string); + + let expected = &random_string.as_bytes()[1..]; + new_ucmd!() + .args(&["-c", "+2"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + + new_ucmd!() + .args(&["-c", "-0"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .no_stdout() + .no_stderr(); + + let expected = &random_string.as_bytes()[1..]; + new_ucmd!() + .args(&["-c", "-8191"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + + new_ucmd!() + .args(&["-c", "-8192"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(random_string); + + new_ucmd!() + .args(&["-c", "-8193"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(random_string); + + let expected = &random_string.as_bytes()[CHUNK_BUFFER_SIZE - 1..]; + new_ucmd!() + .args(&["-c", "-1"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + } + + #[test] + fn test_pipe_when_bytes_option_given_input_size_is_one_byte_greater_than_buffer_size() { + let random_string = RandomString::generate(AlphanumericNewline, CHUNK_BUFFER_SIZE + 1); + let random_string = random_string.as_str(); + + new_ucmd!() + .args(&["-c", "+0"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(random_string); + + let expected = &random_string.as_bytes()[1..]; + new_ucmd!() + .args(&["-c", "+2"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + + new_ucmd!() + .args(&["-c", "-0"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .no_stdout() + 
.no_stderr(); + + let expected = &random_string.as_bytes()[CHUNK_BUFFER_SIZE..]; + new_ucmd!() + .args(&["-c", "-1"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + + let expected = &random_string.as_bytes()[1..]; + new_ucmd!() + .args(&["-c", "-8192"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + + new_ucmd!() + .args(&["-c", "-8193"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(random_string); + } + + #[test] + fn test_pipe_when_bytes_option_given_input_size_has_multiple_size_of_buffer_size() { + let random_string = RandomString::generate(AlphanumericNewline, CHUNK_BUFFER_SIZE * 3); + let random_string = random_string.as_str(); + + new_ucmd!() + .args(&["-c", "+0"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(random_string); + + new_ucmd!() + .args(&["-c", "-0"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .no_stdout() + .no_stderr(); + + let expected = &random_string.as_bytes()[8192..]; + new_ucmd!() + .args(&["-c", "+8193"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + + let expected = &random_string.as_bytes()[8193..]; + new_ucmd!() + .args(&["-c", "+8194"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + + let expected = &random_string.as_bytes()[16384..]; + new_ucmd!() + .args(&["-c", "+16385"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + + let expected = &random_string.as_bytes()[16385..]; + new_ucmd!() + .args(&["-c", "+16386"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + + let expected = &random_string.as_bytes()[16384..]; + new_ucmd!() + .args(&["-c", "-8192"]) + .pipe_in(random_string) + 
.ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + + let expected = &random_string.as_bytes()[16383..]; + new_ucmd!() + .args(&["-c", "-8193"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + + let expected = &random_string.as_bytes()[8192..]; + new_ucmd!() + .args(&["-c", "-16384"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + + let expected = &random_string.as_bytes()[8191..]; + new_ucmd!() + .args(&["-c", "-16385"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only_bytes(expected); + + new_ucmd!() + .args(&["-c", "-24576"]) + .pipe_in(random_string) + .ignore_stdin_write_error() + .succeeds() + .stdout_only(random_string); + } +} + #[test] fn test_seek_bytes_backward_outside_file() { new_ucmd!() diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 3fcd90441..f73cd42af 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1,3 +1,4 @@ #[macro_use] pub mod macros; +pub mod random; pub mod util; diff --git a/tests/common/random.rs b/tests/common/random.rs new file mode 100644 index 000000000..338aeab50 --- /dev/null +++ b/tests/common/random.rs @@ -0,0 +1,314 @@ +// * This file is part of the uutils coreutils package. +// * +// * For the full copyright and license information, please view the LICENSE +// * file that was distributed with this source code. 
+ +use rand::distributions::{Distribution, Uniform}; +use rand::{thread_rng, Rng}; + +/// Samples alphanumeric characters `[A-Za-z0-9]` including newline `\n` +/// +/// # Examples +/// +/// ```rust,ignore +/// use rand::{Rng, thread_rng}; +/// +/// let vec = thread_rng() +/// .sample_iter(AlphanumericNewline) +/// .take(10) +/// .collect::<Vec<u8>>(); +/// println!("Random chars: {}", String::from_utf8(vec).unwrap()); +/// ``` +#[derive(Clone, Copy, Debug)] +pub struct AlphanumericNewline; + +impl AlphanumericNewline { + /// The charset to act upon + const CHARSET: &'static [u8] = + b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789\n"; + + /// Generate a random byte from [`Self::CHARSET`] and return it as `u8`. + /// + /// # Arguments + /// + /// * `rng`: A [`rand::Rng`] + /// + /// returns: u8 + fn random<R>(rng: &mut R) -> u8 + where + R: Rng + ?Sized, + { + let idx = rng.gen_range(0..Self::CHARSET.len()); + Self::CHARSET[idx] + } +} + +impl Distribution<u8> for AlphanumericNewline { + fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> u8 { + Self::random(rng) + } +} + +/// Generate a random string from a [`Distribution`] +/// +/// # Examples +/// +/// ```rust,ignore +/// use crate::common::random::{AlphanumericNewline, RandomString}; +/// use rand::distributions::Alphanumeric; +/// +/// // generates a 100 byte string with characters from AlphanumericNewline +/// let random_string = RandomString::generate(&AlphanumericNewline, 100); +/// assert_eq!(100, random_string.len()); +/// +/// // generates a 100 byte string with 10 newline characters not ending with a newline +/// let string = RandomString::generate_with_delimiter(&Alphanumeric, b'\n', 10, false, 100); +/// assert_eq!(100, string.len()); +/// ``` +pub struct RandomString; + +impl RandomString { + /// Generate a random string from the given [`Distribution`] with the given `length` in bytes.
+ /// + /// # Arguments + /// + /// * `dist`: A u8 [`Distribution`] + /// * `length`: the length of the resulting string in bytes + /// + /// returns: String + pub fn generate<D>(dist: D, length: usize) -> String + where + D: Distribution<u8>, + { + thread_rng() + .sample_iter(dist) + .take(length) + .map(|b| b as char) + .collect() + } + + /// Generate a random string from the [`Distribution`] with the given `length` in bytes. The + /// function takes a `delimiter`, which is randomly distributed in the string, such that exactly + /// `num_delimiter` amount of `delimiter`s occur. If `end_with_delimiter` is set, then the + /// string ends with the delimiter, else the string does not end with the delimiter. + /// + /// # Arguments + /// + /// * `dist`: A `u8` [`Distribution`] + /// * `delimiter`: A `u8` delimiter, which does not need to be included in the `Distribution` + /// * `num_delimiter`: The number of `delimiter`s contained in the resulting string + /// * `end_with_delimiter`: If the string shall end with the given delimiter + /// * `length`: the length of the resulting string in bytes + /// + /// returns: String + /// + /// # Examples + /// + /// ```rust,ignore + /// use crate::common::random::{AlphanumericNewline, RandomString}; + /// + /// // generates a 100 byte string with 10 '\0' byte characters not ending with a '\0' byte + /// let string = RandomString::generate_with_delimiter(&AlphanumericNewline, 0, 10, false, 100); + /// assert_eq!(100, string.len()); + /// assert_eq!( + /// 10, + /// string.as_bytes().iter().filter(|p| **p == 0).count() + /// ); + /// assert!(!string.as_bytes().ends_with(&[0])); + /// ``` + pub fn generate_with_delimiter<D>( + dist: D, + delimiter: u8, + num_delimiter: usize, + end_with_delimiter: bool, + length: usize, + ) -> String + where + D: Distribution<u8>, + { + if length == 0 { + return String::from(""); + } else if length == 1 { + return if num_delimiter > 0 { + String::from(delimiter as char) + } else { +
String::from(thread_rng().sample(&dist) as char) + }; + } + + let samples = length - 1; + let mut result: Vec<u8> = thread_rng().sample_iter(&dist).take(samples).collect(); + + if num_delimiter == 0 { + result.push(thread_rng().sample(&dist)); + return String::from_utf8(result).unwrap(); + } + + let num_delimiter = if end_with_delimiter { + num_delimiter - 1 + } else { + num_delimiter + }; + + let between = Uniform::new(0, samples); + for _ in 0..num_delimiter { + let mut pos = between.sample(&mut thread_rng()); + let turn = pos; + while result[pos] == delimiter { + pos += 1; + if pos >= samples { + pos = 0; + } + if pos == turn { + break; + } + } + result[pos] = delimiter; + } + + if end_with_delimiter { + result.push(delimiter); + } else { + result.push(thread_rng().sample(&dist)); + } + + String::from_utf8(result).unwrap() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::distributions::Alphanumeric; + + #[test] + fn test_random_string_generate() { + let random_string = RandomString::generate(&AlphanumericNewline, 0); + assert_eq!(0, random_string.len()); + + let random_string = RandomString::generate(&AlphanumericNewline, 1); + assert_eq!(1, random_string.len()); + + let random_string = RandomString::generate(&AlphanumericNewline, 100); + assert_eq!(100, random_string.len()); + } + + #[test] + fn test_random_string_generate_with_delimiter_when_length_is_zero() { + let random_string = RandomString::generate_with_delimiter(&Alphanumeric, 0, 0, false, 0); + assert_eq!(0, random_string.len()); + } + + #[test] + fn test_random_string_generate_with_delimiter_when_num_delimiter_is_greater_than_length() { + let random_string = RandomString::generate_with_delimiter(&Alphanumeric, 0, 2, false, 1); + assert_eq!(1, random_string.len()); + assert!(random_string.as_bytes().contains(&0)); + assert!(random_string.as_bytes().ends_with(&[0])); + } + + #[test] + fn test_random_string_generate_with_delimiter_should_end_with_delimiter() { + let random_string =
RandomString::generate_with_delimiter(&Alphanumeric, 0, 1, true, 1); + assert_eq!(1, random_string.len()); + assert_eq!( + 1, + random_string.as_bytes().iter().filter(|p| **p == 0).count() + ); + assert!(random_string.as_bytes().ends_with(&[0])); + + let random_string = RandomString::generate_with_delimiter(&Alphanumeric, 0, 1, false, 1); + assert_eq!(1, random_string.len()); + assert_eq!( + 1, + random_string.as_bytes().iter().filter(|p| **p == 0).count() + ); + assert!(random_string.as_bytes().ends_with(&[0])); + + let random_string = RandomString::generate_with_delimiter(&Alphanumeric, 0, 1, true, 2); + assert_eq!(2, random_string.len()); + assert_eq!( + 1, + random_string.as_bytes().iter().filter(|p| **p == 0).count() + ); + assert!(random_string.as_bytes().ends_with(&[0])); + + let random_string = RandomString::generate_with_delimiter(&Alphanumeric, 0, 2, true, 2); + assert_eq!(2, random_string.len()); + assert_eq!( + 2, + random_string.as_bytes().iter().filter(|p| **p == 0).count() + ); + assert!(random_string.as_bytes().ends_with(&[0])); + + let random_string = RandomString::generate_with_delimiter(&Alphanumeric, 0, 1, true, 3); + assert_eq!(3, random_string.len()); + assert_eq!( + 1, + random_string.as_bytes().iter().filter(|p| **p == 0).count() + ); + assert!(random_string.as_bytes().ends_with(&[0])); + } + + #[test] + fn test_random_string_generate_with_delimiter_should_not_end_with_delimiter() { + let random_string = RandomString::generate_with_delimiter(&Alphanumeric, 0, 0, false, 1); + assert_eq!(1, random_string.len()); + assert_eq!( + 0, + random_string.as_bytes().iter().filter(|p| **p == 0).count() + ); + + let random_string = RandomString::generate_with_delimiter(&Alphanumeric, 0, 0, true, 1); + assert_eq!(1, random_string.len()); + assert_eq!( + 0, + random_string.as_bytes().iter().filter(|p| **p == 0).count() + ); + + let random_string = RandomString::generate_with_delimiter(&Alphanumeric, 0, 1, false, 2); + assert_eq!(2, random_string.len()); + 
assert_eq!( + 1, + random_string.as_bytes().iter().filter(|p| **p == 0).count() + ); + assert!(!random_string.as_bytes().ends_with(&[0])); + + let random_string = RandomString::generate_with_delimiter(&Alphanumeric, 0, 1, false, 3); + assert_eq!(3, random_string.len()); + assert_eq!( + 1, + random_string.as_bytes().iter().filter(|p| **p == 0).count() + ); + assert!(!random_string.as_bytes().ends_with(&[0])); + + let random_string = RandomString::generate_with_delimiter(&Alphanumeric, 0, 2, false, 3); + assert_eq!(3, random_string.len()); + assert_eq!( + 2, + random_string.as_bytes().iter().filter(|p| **p == 0).count() + ); + assert!(!random_string.as_bytes().ends_with(&[0])); + } + + #[test] + fn test_generate_with_delimiter_with_greater_length() { + let random_string = + RandomString::generate_with_delimiter(&Alphanumeric, 0, 100, false, 1000); + assert_eq!(1000, random_string.len()); + assert_eq!( + 100, + random_string.as_bytes().iter().filter(|p| **p == 0).count() + ); + assert!(!random_string.as_bytes().ends_with(&[0])); + + let random_string = + RandomString::generate_with_delimiter(&Alphanumeric, 0, 100, true, 1000); + assert_eq!(1000, random_string.len()); + assert_eq!( + 100, + random_string.as_bytes().iter().filter(|p| **p == 0).count() + ); + assert!(random_string.as_bytes().ends_with(&[0])); + } +}