Merge pull request #2712 from jfinkels/head-take-lines-reader

head: use std::io::copy() with TakeLines reader
2025-07-27 11:07:44 +00:00 · 2021-10-23 17:53:44 +02:00 · 2021-10-23 17:53:44 +02:00 · 811698b658
commit 811698b658
parent 803c05cb4a 858b0a9e9f
6 changed files with 146 additions and 82 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2507,6 +2507,7 @@ name = "uu_head"
 version = "0.0.7"
 dependencies = [
 "clap",
 "memchr 2.4.0",
 "uucore",
 "uucore_procs",
 ]
--- a/src/uu/head/BENCHMARKING.md
+++ b/src/uu/head/BENCHMARKING.md
@ -0,0 +1,41 @@
 # Benchmarking to measure performance
 To compare the performance of the `uutils` version of `head` with the
 GNU version of `head`, you can use a benchmarking tool like
 [hyperfine][0]. On Ubuntu 18.04 or later, you can install `hyperfine` by
 running
    sudo apt-get install hyperfine
 Next, build the `head` binary under the release profile:
    cargo build --release -p uu_head
 Now, get a text file to test `head` on. I used the *Complete Works of
 William Shakespeare*, which is in the public domain in the United States
 and most other parts of the world.
    wget -O shakespeare.txt https://www.gutenberg.org/files/100/100-0.txt
 This particular file has about 170,000 lines, each of which is no longer
 than 96 characters:
    $ wc -lL shakespeare.txt 
    170592      96 shakespeare.txt
 You could use files of different shapes and sizes to test the
 performance of `head` in different situations. For a larger file, you
 could download a [database dump of Wikidata][1] or some related files
 that the Wikimedia project provides. For example, [this file][2]
 contains about 130 million lines.
 Finally, you can compare the performance of the two versions of `head`
 by running, for example,
    hyperfine \
        "head -n 100000 shakespeare.txt" \
        "target/release/head -n 100000 shakespeare.txt"
 [0]: https://github.com/sharkdp/hyperfine
 [1]: https://www.wikidata.org/wiki/Wikidata:Database_download
 [2]: https://dumps.wikimedia.org/wikidatawiki/20211001/wikidatawiki-20211001-pages-logging.xml.gz
--- a/src/uu/head/Cargo.toml
+++ b/src/uu/head/Cargo.toml
@ -16,6 +16,7 @@ path = "src/head.rs"
 [dependencies]
 clap = { version = "2.33", features = ["wrap_help"] }
 memchr = "2"
 uucore = { version=">=0.0.9", package="uucore", path="../../uucore", features=["ringbuffer"] }
 uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
--- a/src/uu/head/src/head.rs
+++ b/src/uu/head/src/head.rs
@ -3,18 +3,21 @@
 //  * For the full copyright and license information, please view the LICENSE
 //  * file that was distributed with this source code.
-// spell-checker:ignore (vars) zlines
+// spell-checker:ignore (vars) zlines BUFWRITER
 use clap::{crate_version, App, Arg};
 use std::convert::TryFrom;
 use std::ffi::OsString;
-use std::io::{self, ErrorKind, Read, Seek, SeekFrom, Write};
+use std::io::{self, BufWriter, ErrorKind, Read, Seek, SeekFrom, Write};
 use uucore::display::Quotable;
 use uucore::error::{UResult, USimpleError};
 use uucore::show_error_custom_description;
 const BUF_SIZE: usize = 65536;
 /// The capacity in bytes for buffered writers.
 const BUFWRITER_CAPACITY: usize = 16_384; // 16 kilobytes
 const ABOUT: &str = "\
                     Print the first 10 lines of each FILE to standard output.\n\
                     With more than one FILE, precede each with a header giving the file name.\n\
@ -34,10 +37,10 @@ mod options {
 }
 mod lines;
 mod parse;
 mod split;
 mod take;
 use lines::zlines;
 use take::take_all_but;
 use take::take_lines;
 pub fn uu_app() -> App<'static, 'static> {
    App::new(uucore::util_name())
@ -208,26 +211,18 @@ where
 }
 fn read_n_lines(input: &mut impl std::io::BufRead, n: usize, zero: bool) -> std::io::Result<()> {
-    if n == 0 {
+    // Read the first `n` lines from the `input` reader.
-        return Ok(());
+    let separator = if zero { b'\0' } else { b'\n' };
-    }
+    let mut reader = take_lines(input, n, separator);
    // Write those bytes to `stdout`.
    let stdout = std::io::stdout();
-    let mut stdout = stdout.lock();
+    let stdout = stdout.lock();
-    let mut lines = 0usize;
+    let mut writer = BufWriter::with_capacity(BUFWRITER_CAPACITY, stdout);
-    split::walk_lines(input, zero, |e| match e {
+
-        split::Event::Data(dat) => {
+    io::copy(&mut reader, &mut writer)?;
-            stdout.write_all(dat)?;
+
-            Ok(true)
+    Ok(())
        }
        split::Event::Line => {
            lines += 1;
            if lines == n {
                Ok(false)
            } else {
                Ok(true)
            }
        }
    })
 }
 fn read_but_last_n_bytes(input: &mut impl std::io::BufRead, n: usize) -> std::io::Result<()> {
--- a/src/uu/head/src/split.rs
+++ b/src/uu/head/src/split.rs
@ -1,60 +0,0 @@
 /// An event reported to the callback of `walk_lines` while it scans its input.
 #[derive(Debug)]
 pub enum Event<'a> {
    /// A run of bytes read from the input. A slice that completes a line
    /// ends with the delimiter byte; the slice covering the tail of a read
    /// chunk has no delimiter and may be empty.
    Data(&'a [u8]),
    /// Emitted directly after the `Data` slice that ended with the line
    /// delimiter, i.e. one complete line has just been delivered.
    Line,
 }
 /// Stream the contents of a `BufRead` to a callback, line by line.
 ///
 /// # Arguments
 ///
 /// * `input` - the `BufRead` to read from
 /// * `zero` - when `true`, lines are delimited by `0u8` instead of `b'\n'`
 /// * `on_event` - closure invoked with `Event::Data` for every slice of
 ///   bytes read, and with `Event::Line` *directly* after the slice that
 ///   ends with the delimiter has been passed. The boolean returned for
 ///   `Event::Line` decides whether to continue; the boolean returned for
 ///   `Event::Data` is not inspected.
 ///
 /// # Errors
 ///
 /// Returns any read error other than `ErrorKind::Interrupted` (those reads
 /// are retried) and any error produced by `on_event`.
 pub fn walk_lines<F>(
    input: &mut impl std::io::BufRead,
    zero: bool,
    mut on_event: F,
 ) -> std::io::Result<()>
 where
    F: FnMut(Event) -> std::io::Result<bool>,
 {
    let delimiter = if zero { 0u8 } else { b'\n' };
    let mut buffer = [0u8; super::BUF_SIZE];
    loop {
        let filled = match input.read(&mut buffer) {
            // End of input: nothing more to report.
            Ok(0) => return Ok(()),
            Ok(count) => count,
            // An interrupted read is simply retried.
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err),
        };

        // `pending` always holds the part of the chunk not yet reported.
        let mut pending = &buffer[..filled];
        while let Some(pos) = pending.iter().position(|&byte| byte == delimiter) {
            let (line, rest) = pending.split_at(pos + 1);
            on_event(Event::Data(line))?;
            pending = rest;
            if !on_event(Event::Line)? {
                return Ok(());
            }
        }
        // Whatever follows the last delimiter (possibly nothing) is still data.
        on_event(Event::Data(pending))?;
    }
 }
--- a/src/uu/head/src/take.rs
+++ b/src/uu/head/src/take.rs
@ -1,4 +1,8 @@
 //! Take all but the last elements of an iterator, or only the first lines of a reader.
 use std::io::Read;
 use memchr::memchr_iter;
 use uucore::ringbuffer::RingBuffer;
 /// Create an iterator over all but the last `n` elements of `iter`.
@ -58,10 +62,63 @@ where
    }
 }
 /// Like `std::io::Take`, but for lines instead of bytes.
 ///
 /// This struct is generally created by calling [`take_lines`] on a
 /// reader. Please see the documentation of [`take_lines`] for more
 /// details.
 pub struct TakeLines<T> {
    /// The wrapped reader that bytes are pulled from.
    inner: T,
    /// How many more line separators may be passed through; once this
    /// reaches zero every further read reports EOF.
    limit: usize,
    /// The byte that marks the end of a line.
    separator: u8,
 }
 impl<T: Read> Read for TakeLines<T> {
    /// Fill `buf` from the inner reader without going past the line budget.
    ///
    /// Returns `Ok(0)` when the budget is already exhausted or the inner
    /// reader has no more data. Otherwise performs a single read on the
    /// inner reader and reports either everything it produced, or only the
    /// bytes up to and including the separator that uses up the budget.
    /// Errors from the inner reader are passed through unchanged.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        if self.limit == 0 {
            return Ok(0);
        }

        let filled = self.inner.read(buf)?;

        // Look at no more separators than the budget allows; the one that
        // brings the budget to zero marks where the reported data ends.
        let mut remaining = self.limit;
        let mut end = filled;
        for pos in memchr_iter(self.separator, &buf[..filled]).take(self.limit) {
            remaining -= 1;
            if remaining == 0 {
                end = pos + 1;
            }
        }

        self.limit = remaining;
        Ok(end)
    }
 }
 /// Wrap `reader` in an adaptor that yields at most `limit` lines.
 ///
 /// The returned value implements `Read` whenever `reader` does. After
 /// `limit` lines have been read from it, every further read reports EOF
 /// (`Ok(0)`).
 ///
 /// `separator` is the byte treated as the end of a line; pass `b'\n'`
 /// for the usual notion of "line".
 pub fn take_lines<R>(reader: R, limit: usize, separator: u8) -> TakeLines<R> {
    let inner = reader;
    TakeLines { inner, limit, separator }
 }
 #[cfg(test)]
 mod tests {
    use std::io::BufRead;
    use std::io::BufReader;
    use crate::take::take_all_but;
    use crate::take::take_lines;
    #[test]
    fn test_fewer_elements() {
@ -90,4 +147,33 @@ mod tests {
        assert_eq!(Some(&2), iter.next());
        assert_eq!(None, iter.next());
    }
    /// A limit of zero lines yields no output at all.
    #[test]
    fn test_zero_lines() {
        let source = std::io::Cursor::new("a\nb\nc\n");
        let limited = BufReader::new(take_lines(source, 0, b'\n'));
        let lines: Vec<String> = limited.lines().map(|line| line.unwrap()).collect();
        assert!(lines.is_empty());
    }
    /// A limit below the number of available lines truncates the output.
    #[test]
    fn test_fewer_lines() {
        let source = std::io::Cursor::new("a\nb\nc\n");
        let limited = BufReader::new(take_lines(source, 2, b'\n'));
        let lines: Vec<String> = limited.lines().map(|line| line.unwrap()).collect();
        assert_eq!(vec![String::from("a"), String::from("b")], lines);
    }
    /// A limit above the number of available lines yields the whole input.
    #[test]
    fn test_more_lines() {
        let source = std::io::Cursor::new("a\nb\nc\n");
        let limited = BufReader::new(take_lines(source, 4, b'\n'));
        let lines: Vec<String> = limited.lines().map(|line| line.unwrap()).collect();
        assert_eq!(
            vec![String::from("a"), String::from("b"), String::from("c")],
            lines
        );
    }
 }