diff --git a/Cargo.lock b/Cargo.lock index 3633928c6..0cffb8d73 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2507,6 +2507,7 @@ name = "uu_head" version = "0.0.7" dependencies = [ "clap", + "memchr 2.4.0", "uucore", "uucore_procs", ] diff --git a/src/uu/head/BENCHMARKING.md b/src/uu/head/BENCHMARKING.md new file mode 100644 index 000000000..49574eb79 --- /dev/null +++ b/src/uu/head/BENCHMARKING.md @@ -0,0 +1,41 @@ +# Benchmarking to measure performance + +To compare the performance of the `uutils` version of `head` with the +GNU version of `head`, you can use a benchmarking tool like +[hyperfine][0]. On Ubuntu 18.04 or later, you can install `hyperfine` by +running + + sudo apt-get install hyperfine + +Next, build the `head` binary under the release profile: + + cargo build --release -p uu_head + +Now, get a text file to test `head` on. I used the *Complete Works of +William Shakespeare*, which is in the public domain in the United States +and most other parts of the world. + + wget -O shakespeare.txt https://www.gutenberg.org/files/100/100-0.txt + +This particular file has about 170,000 lines, each of which is no longer +than 96 characters: + + $ wc -lL shakespeare.txt + 170592 96 shakespeare.txt + +You could use files of different shapes and sizes to test the +performance of `head` in different situations. For a larger file, you +could download a [database dump of Wikidata][1] or some related files +that the Wikimedia project provides. For example, [this file][2] +contains about 130 million lines. 
+ +Finally, you can compare the performance of the two versions of `head` +by running, for example, + + hyperfine \ + "head -n 100000 shakespeare.txt" \ + "target/release/head -n 100000 shakespeare.txt" + +[0]: https://github.com/sharkdp/hyperfine +[1]: https://www.wikidata.org/wiki/Wikidata:Database_download +[2]: https://dumps.wikimedia.org/wikidatawiki/20211001/wikidatawiki-20211001-pages-logging.xml.gz diff --git a/src/uu/head/Cargo.toml b/src/uu/head/Cargo.toml index 4fa4c0c81..3a615812b 100644 --- a/src/uu/head/Cargo.toml +++ b/src/uu/head/Cargo.toml @@ -16,6 +16,7 @@ path = "src/head.rs" [dependencies] clap = { version = "2.33", features = ["wrap_help"] } +memchr = "2" uucore = { version=">=0.0.9", package="uucore", path="../../uucore", features=["ringbuffer"] } uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" } diff --git a/src/uu/head/src/head.rs b/src/uu/head/src/head.rs index c33ec693b..e3325d084 100644 --- a/src/uu/head/src/head.rs +++ b/src/uu/head/src/head.rs @@ -3,18 +3,21 @@ // * For the full copyright and license information, please view the LICENSE // * file that was distributed with this source code. -// spell-checker:ignore (vars) zlines +// spell-checker:ignore (vars) zlines BUFWRITER use clap::{crate_version, App, Arg}; use std::convert::TryFrom; use std::ffi::OsString; -use std::io::{self, ErrorKind, Read, Seek, SeekFrom, Write}; +use std::io::{self, BufWriter, ErrorKind, Read, Seek, SeekFrom, Write}; use uucore::display::Quotable; use uucore::error::{UResult, USimpleError}; use uucore::show_error_custom_description; const BUF_SIZE: usize = 65536; +/// The capacity in bytes for buffered writers. 
+const BUFWRITER_CAPACITY: usize = 16_384; // 16 kilobytes + const ABOUT: &str = "\ Print the first 10 lines of each FILE to standard output.\n\ With more than one FILE, precede each with a header giving the file name.\n\ @@ -34,10 +37,10 @@ mod options { } mod lines; mod parse; -mod split; mod take; use lines::zlines; use take::take_all_but; +use take::take_lines; pub fn uu_app() -> App<'static, 'static> { App::new(uucore::util_name()) @@ -208,26 +211,21 @@ where } fn read_n_lines(input: &mut impl std::io::BufRead, n: usize, zero: bool) -> std::io::Result<()> { - if n == 0 { - return Ok(()); - } + // Read the first `n` lines from the `input` reader. + let separator = if zero { b'\0' } else { b'\n' }; + let mut reader = take_lines(input, n, separator); + + // Write those bytes to `stdout`. let stdout = std::io::stdout(); - let mut stdout = stdout.lock(); - let mut lines = 0usize; - split::walk_lines(input, zero, |e| match e { - split::Event::Data(dat) => { - stdout.write_all(dat)?; - Ok(true) - } - split::Event::Line => { - lines += 1; - if lines == n { - Ok(false) - } else { - Ok(true) - } - } - }) + let stdout = stdout.lock(); + let mut writer = BufWriter::with_capacity(BUFWRITER_CAPACITY, stdout); + + io::copy(&mut reader, &mut writer)?; + + // Flush explicitly: an error raised while the `BufWriter` is dropped would be silently ignored. + writer.flush()?; + + Ok(()) } fn read_but_last_n_bytes(input: &mut impl std::io::BufRead, n: usize) -> std::io::Result<()> { diff --git a/src/uu/head/src/split.rs b/src/uu/head/src/split.rs deleted file mode 100644 index 9e9a0c685..000000000 --- a/src/uu/head/src/split.rs +++ /dev/null @@ -1,60 +0,0 @@ -#[derive(Debug)] -pub enum Event<'a> { - Data(&'a [u8]), - Line, -} -/// Loops over the lines read from a BufRead. -/// # Arguments -/// * `input` the ReadBuf to read from -/// * `zero` whether to use 0u8 as a line delimiter -/// * `on_event` a closure receiving some bytes read in a slice, or -/// event signalling a line was just read.
-/// this is guaranteed to be signalled *directly* after the -/// slice containing the (CR on win)LF / 0 is passed -/// -/// Return whether to continue -pub fn walk_lines<F>( - input: &mut impl std::io::BufRead, - zero: bool, - mut on_event: F, -) -> std::io::Result<()> -where - F: FnMut(Event) -> std::io::Result<bool>, -{ - let mut buffer = [0u8; super::BUF_SIZE]; - loop { - let read = loop { - match input.read(&mut buffer) { - Ok(n) => break n, - Err(e) => match e.kind() { - std::io::ErrorKind::Interrupted => {} - _ => return Err(e), - }, - } - }; - if read == 0 { - return Ok(()); - } - let mut base = 0usize; - for (i, byte) in buffer[..read].iter().enumerate() { - match byte { - b'\n' if !zero => { - on_event(Event::Data(&buffer[base..=i]))?; - base = i + 1; - if !on_event(Event::Line)? { - return Ok(()); - } - } - 0u8 if zero => { - on_event(Event::Data(&buffer[base..=i]))?; - base = i + 1; - if !on_event(Event::Line)? { - return Ok(()); - } - } - _ => {} - } - } - on_event(Event::Data(&buffer[base..read]))?; - } -} diff --git a/src/uu/head/src/take.rs b/src/uu/head/src/take.rs index 94fa012be..5f4c29b65 100644 --- a/src/uu/head/src/take.rs +++ b/src/uu/head/src/take.rs @@ -1,4 +1,8 @@ //! Take all but the last elements of an iterator. +use std::io::Read; + +use memchr::memchr_iter; + use uucore::ringbuffer::RingBuffer; /// Create an iterator over all but the last `n` elements of `iter`. @@ -58,10 +62,63 @@ where } } +/// Like `std::io::Take`, but for lines instead of bytes. +/// +/// This struct is generally created by calling [`take_lines`] on a +/// reader. Please see the documentation of [`take_lines`] for more +/// details. +pub struct TakeLines<T> { + inner: T, + limit: usize, + separator: u8, +} + +impl<T: Read> Read for TakeLines<T> { + /// Read bytes from the inner reader into `buf`, stopping just after the line that exhausts the limit.
+ fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> { + if self.limit == 0 { + return Ok(0); + } + match self.inner.read(buf) { + Ok(0) => Ok(0), + Ok(n) => { + for i in memchr_iter(self.separator, &buf[..n]) { + self.limit -= 1; + if self.limit == 0 { + return Ok(i + 1); + } + } + Ok(n) + } + Err(e) => Err(e), + } + } +} + +/// Create an adaptor that will read at most `limit` lines from a given reader. +/// +/// This function returns a new instance of `Read` that will read at +/// most `limit` lines, after which it will always return EOF +/// (`Ok(0)`). +/// +/// The `separator` defines the character to interpret as the line +/// ending. For the usual notion of "line", set this to `b'\n'`. +pub fn take_lines<R>(reader: R, limit: usize, separator: u8) -> TakeLines<R> { + TakeLines { + inner: reader, + limit, + separator, + } +} + #[cfg(test)] mod tests { + use std::io::BufRead; + use std::io::BufReader; + use crate::take::take_all_but; + use crate::take::take_lines; #[test] fn test_fewer_elements() { @@ -90,4 +147,33 @@ mod tests { assert_eq!(Some(&2), iter.next()); assert_eq!(None, iter.next()); } + + #[test] + fn test_zero_lines() { + let input_reader = std::io::Cursor::new("a\nb\nc\n"); + let output_reader = BufReader::new(take_lines(input_reader, 0, b'\n')); + let mut iter = output_reader.lines().map(|l| l.unwrap()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_fewer_lines() { + let input_reader = std::io::Cursor::new("a\nb\nc\n"); + let output_reader = BufReader::new(take_lines(input_reader, 2, b'\n')); + let mut iter = output_reader.lines().map(|l| l.unwrap()); + assert_eq!(Some(String::from("a")), iter.next()); + assert_eq!(Some(String::from("b")), iter.next()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_more_lines() { + let input_reader = std::io::Cursor::new("a\nb\nc\n"); + let output_reader = BufReader::new(take_lines(input_reader, 4, b'\n')); + let mut iter = output_reader.lines().map(|l| l.unwrap()); + 
assert_eq!(Some(String::from("a")), iter.next()); + assert_eq!(Some(String::from("b")), iter.next()); + assert_eq!(Some(String::from("c")), iter.next()); + assert_eq!(None, iter.next()); + } }