1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-27 11:07:44 +00:00

Merge pull request #2712 from jfinkels/head-take-lines-reader

head: use std::io::copy() with TakeLines reader
This commit is contained in:
Sylvestre Ledru 2021-10-23 17:53:44 +02:00 committed by GitHub
commit 811698b658
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 146 additions and 82 deletions

1
Cargo.lock generated
View file

@ -2507,6 +2507,7 @@ name = "uu_head"
version = "0.0.7" version = "0.0.7"
dependencies = [ dependencies = [
"clap", "clap",
"memchr 2.4.0",
"uucore", "uucore",
"uucore_procs", "uucore_procs",
] ]

View file

@ -0,0 +1,41 @@
# Benchmarking to measure performance
To compare the performance of the `uutils` version of `head` with the
GNU version of `head`, you can use a benchmarking tool like
[hyperfine][0]. On Ubuntu 18.04 or later, you can install `hyperfine` by
running
sudo apt-get install hyperfine
Next, build the `head` binary under the release profile:
cargo build --release -p uu_head
Now, get a text file to test `head` on. I used the *Complete Works of
William Shakespeare*, which is in the public domain in the United States
and most other parts of the world.
wget -O shakespeare.txt https://www.gutenberg.org/files/100/100-0.txt
This particular file has about 170,000 lines, each of which is no longer
than 96 characters:
$ wc -lL shakespeare.txt
170592 96 shakespeare.txt
You could use files of different shapes and sizes to test the
performance of `head` in different situations. For a larger file, you
could download a [database dump of Wikidata][1] or some related files
that the Wikimedia project provides. For example, [this file][2]
contains about 130 million lines.
Finally, you can compare the performance of the two versions of `head`
by running, for example,
hyperfine \
"head -n 100000 shakespeare.txt" \
"target/release/head -n 100000 shakespeare.txt"
[0]: https://github.com/sharkdp/hyperfine
[1]: https://www.wikidata.org/wiki/Wikidata:Database_download
[2]: https://dumps.wikimedia.org/wikidatawiki/20211001/wikidatawiki-20211001-pages-logging.xml.gz

View file

@ -16,6 +16,7 @@ path = "src/head.rs"
[dependencies] [dependencies]
clap = { version = "2.33", features = ["wrap_help"] } clap = { version = "2.33", features = ["wrap_help"] }
memchr = "2"
uucore = { version=">=0.0.9", package="uucore", path="../../uucore", features=["ringbuffer"] } uucore = { version=">=0.0.9", package="uucore", path="../../uucore", features=["ringbuffer"] }
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" } uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }

View file

@ -3,18 +3,21 @@
// * For the full copyright and license information, please view the LICENSE // * For the full copyright and license information, please view the LICENSE
// * file that was distributed with this source code. // * file that was distributed with this source code.
// spell-checker:ignore (vars) zlines // spell-checker:ignore (vars) zlines BUFWRITER
use clap::{crate_version, App, Arg}; use clap::{crate_version, App, Arg};
use std::convert::TryFrom; use std::convert::TryFrom;
use std::ffi::OsString; use std::ffi::OsString;
use std::io::{self, ErrorKind, Read, Seek, SeekFrom, Write}; use std::io::{self, BufWriter, ErrorKind, Read, Seek, SeekFrom, Write};
use uucore::display::Quotable; use uucore::display::Quotable;
use uucore::error::{UResult, USimpleError}; use uucore::error::{UResult, USimpleError};
use uucore::show_error_custom_description; use uucore::show_error_custom_description;
const BUF_SIZE: usize = 65536; const BUF_SIZE: usize = 65536;
/// The capacity in bytes for buffered writers.
const BUFWRITER_CAPACITY: usize = 16_384; // 16 kilobytes
const ABOUT: &str = "\ const ABOUT: &str = "\
Print the first 10 lines of each FILE to standard output.\n\ Print the first 10 lines of each FILE to standard output.\n\
With more than one FILE, precede each with a header giving the file name.\n\ With more than one FILE, precede each with a header giving the file name.\n\
@ -34,10 +37,10 @@ mod options {
} }
mod lines; mod lines;
mod parse; mod parse;
mod split;
mod take; mod take;
use lines::zlines; use lines::zlines;
use take::take_all_but; use take::take_all_but;
use take::take_lines;
pub fn uu_app() -> App<'static, 'static> { pub fn uu_app() -> App<'static, 'static> {
App::new(uucore::util_name()) App::new(uucore::util_name())
@ -208,26 +211,18 @@ where
} }
fn read_n_lines(input: &mut impl std::io::BufRead, n: usize, zero: bool) -> std::io::Result<()> { fn read_n_lines(input: &mut impl std::io::BufRead, n: usize, zero: bool) -> std::io::Result<()> {
if n == 0 { // Read the first `n` lines from the `input` reader.
return Ok(()); let separator = if zero { b'\0' } else { b'\n' };
} let mut reader = take_lines(input, n, separator);
// Write those bytes to `stdout`.
let stdout = std::io::stdout(); let stdout = std::io::stdout();
let mut stdout = stdout.lock(); let stdout = stdout.lock();
let mut lines = 0usize; let mut writer = BufWriter::with_capacity(BUFWRITER_CAPACITY, stdout);
split::walk_lines(input, zero, |e| match e {
split::Event::Data(dat) => { io::copy(&mut reader, &mut writer)?;
stdout.write_all(dat)?;
Ok(true) Ok(())
}
split::Event::Line => {
lines += 1;
if lines == n {
Ok(false)
} else {
Ok(true)
}
}
})
} }
fn read_but_last_n_bytes(input: &mut impl std::io::BufRead, n: usize) -> std::io::Result<()> { fn read_but_last_n_bytes(input: &mut impl std::io::BufRead, n: usize) -> std::io::Result<()> {

View file

@ -1,60 +0,0 @@
/// An event passed by [`walk_lines`] to its callback.
#[derive(Debug)]
pub enum Event<'a> {
/// A slice of bytes that was just read from the input. When the slice
/// closes a line, its last byte is the line delimiter.
Data(&'a [u8]),
/// A line delimiter was just seen; sent directly after the `Data`
/// slice that ends with that delimiter.
Line,
}
/// Streams the contents of a `BufRead` to a callback, flagging line ends.
///
/// # Arguments
///
/// * `input` - the `BufRead` to read from
/// * `zero` - when `true`, lines are delimited by `0u8` instead of `b'\n'`
/// * `on_event` - closure that receives each slice of bytes read
///   (`Event::Data`) and a notification (`Event::Line`) *directly* after
///   every slice that ends with the delimiter
///
/// The closure's boolean result is only consulted for `Event::Line`:
/// returning `false` there stops the walk. Any error returned by the
/// closure or by the reader (other than `Interrupted`, which is retried)
/// is propagated to the caller.
pub fn walk_lines<F>(
    input: &mut impl std::io::BufRead,
    zero: bool,
    mut on_event: F,
) -> std::io::Result<()>
where
    F: FnMut(Event) -> std::io::Result<bool>,
{
    // The delimiter is fixed for the whole walk, so compute it once.
    let delimiter = if zero { 0u8 } else { b'\n' };
    let mut chunk = [0u8; super::BUF_SIZE];
    loop {
        let filled = match input.read(&mut chunk) {
            // End of input: nothing more to report.
            Ok(0) => return Ok(()),
            Ok(count) => count,
            // An interrupted read is not a failure; simply try again.
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err),
        };

        // Start of the not-yet-reported part of `chunk`.
        let mut start = 0usize;
        for (idx, &byte) in chunk[..filled].iter().enumerate() {
            if byte != delimiter {
                continue;
            }
            // Hand over everything up to and including the delimiter,
            // then announce the end of the line.
            on_event(Event::Data(&chunk[start..=idx]))?;
            start = idx + 1;
            if !on_event(Event::Line)? {
                return Ok(());
            }
        }
        // Report the trailing bytes of this chunk (possibly an empty slice).
        on_event(Event::Data(&chunk[start..filled]))?;
    }
}

View file

@ -1,4 +1,8 @@
//! Take all but the last elements of an iterator. //! Take all but the last elements of an iterator.
use std::io::Read;
use memchr::memchr_iter;
use uucore::ringbuffer::RingBuffer; use uucore::ringbuffer::RingBuffer;
/// Create an iterator over all but the last `n` elements of `iter`. /// Create an iterator over all but the last `n` elements of `iter`.
@ -58,10 +62,63 @@ where
} }
} }
/// Like `std::io::Take`, but for lines instead of bytes.
///
/// This struct is generally created by calling [`take_lines`] on a
/// reader. Please see the documentation of [`take_lines`] for more
/// details.
pub struct TakeLines<T> {
// The wrapped reader that bytes are pulled from.
inner: T,
// Number of separators still to be seen; once it reaches zero every
// further read reports EOF.
limit: usize,
// The byte that marks the end of a line.
separator: u8,
}
impl<T: Read> Read for TakeLines<T> {
    /// Fill `buf` from the inner reader, stopping after the last allowed line.
    ///
    /// Returns `Ok(0)` once the line budget is used up. Otherwise performs
    /// one read on the inner reader and counts separators in the bytes it
    /// produced; if the budget runs out inside this read, only the bytes up
    /// to and including the final separator are reported.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        // Budget exhausted: behave like a reader at EOF.
        if self.limit == 0 {
            return Ok(0);
        }

        let filled = self.inner.read(buf)?;

        // A zero-length read yields no separators, so this loop is skipped
        // and `Ok(0)` falls out at the bottom.
        for pos in memchr_iter(self.separator, &buf[..filled]) {
            self.limit -= 1;
            if self.limit == 0 {
                // Truncate the reported length right after the separator.
                return Ok(pos + 1);
            }
        }

        Ok(filled)
    }
}
/// Wrap `reader` so that it yields no more than `limit` lines.
///
/// The returned value implements `Read`. Once `limit` lines have been
/// read from it, every further read reports EOF (`Ok(0)`).
///
/// `separator` is the byte that ends a line; pass `b'\n'` for the usual
/// notion of "line".
pub fn take_lines<R>(reader: R, limit: usize, separator: u8) -> TakeLines<R> {
    let inner = reader;
    TakeLines {
        inner,
        limit,
        separator,
    }
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::io::BufRead;
use std::io::BufReader;
use crate::take::take_all_but; use crate::take::take_all_but;
use crate::take::take_lines;
#[test] #[test]
fn test_fewer_elements() { fn test_fewer_elements() {
@ -90,4 +147,33 @@ mod tests {
assert_eq!(Some(&2), iter.next()); assert_eq!(Some(&2), iter.next());
assert_eq!(None, iter.next()); assert_eq!(None, iter.next());
} }
#[test]
fn test_zero_lines() {
    // A limit of zero must produce no lines at all.
    let source = std::io::Cursor::new("a\nb\nc\n");
    let limited = BufReader::new(take_lines(source, 0, b'\n'));
    let collected: Vec<String> = limited.lines().map(|line| line.unwrap()).collect();
    assert!(collected.is_empty());
}
#[test]
fn test_fewer_lines() {
    // Asking for fewer lines than available yields exactly that many.
    let source = std::io::Cursor::new("a\nb\nc\n");
    let limited = BufReader::new(take_lines(source, 2, b'\n'));
    let collected: Vec<String> = limited.lines().map(|line| line.unwrap()).collect();
    assert_eq!(collected, ["a", "b"]);
}
#[test]
fn test_more_lines() {
    // Asking for more lines than available yields the whole input.
    let source = std::io::Cursor::new("a\nb\nc\n");
    let limited = BufReader::new(take_lines(source, 4, b'\n'));
    let collected: Vec<String> = limited.lines().map(|line| line.unwrap()).collect();
    assert_eq!(collected, ["a", "b", "c"]);
}
} }