mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-27 19:17:43 +00:00
head: use std::io::copy() with TakeLines reader
Replace the custom `split::walk_lines()` function with a call to `std::io::copy()`, using a new `TakeLines` reader as the source and `stdout` as the destination. The `TakeLines` reader is an adaptor that scans the bytes being read for line-ending characters and stops reading after a given number of lines has been read (similar to the `std::io::Take` adaptor). This change:
* makes the `read_n_lines()` function more concise,
* allows it to mirror the implementation of `read_n_bytes()`,
* increases the speed of `head -n NUM`.
This commit is contained in:
parent
c43436d50a
commit
858b0a9e9f
6 changed files with 146 additions and 82 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -2507,6 +2507,7 @@ name = "uu_head"
|
||||||
version = "0.0.7"
|
version = "0.0.7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
|
"memchr 2.4.0",
|
||||||
"uucore",
|
"uucore",
|
||||||
"uucore_procs",
|
"uucore_procs",
|
||||||
]
|
]
|
||||||
|
|
41
src/uu/head/BENCHMARKING.md
Normal file
41
src/uu/head/BENCHMARKING.md
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
# Benchmarking to measure performance
|
||||||
|
|
||||||
|
To compare the performance of the `uutils` version of `head` with the
|
||||||
|
GNU version of `head`, you can use a benchmarking tool like
|
||||||
|
[hyperfine][0]. On Ubuntu 18.04 or later, you can install `hyperfine` by
|
||||||
|
running
|
||||||
|
|
||||||
|
sudo apt-get install hyperfine
|
||||||
|
|
||||||
|
Next, build the `head` binary under the release profile:
|
||||||
|
|
||||||
|
cargo build --release -p uu_head
|
||||||
|
|
||||||
|
Now, get a text file to test `head` on. I used the *Complete Works of
|
||||||
|
William Shakespeare*, which is in the public domain in the United States
|
||||||
|
and most other parts of the world.
|
||||||
|
|
||||||
|
wget -O shakespeare.txt https://www.gutenberg.org/files/100/100-0.txt
|
||||||
|
|
||||||
|
This particular file has about 170,000 lines, each of which is no longer
|
||||||
|
than 96 characters:
|
||||||
|
|
||||||
|
$ wc -lL shakespeare.txt
|
||||||
|
170592 96 shakespeare.txt
|
||||||
|
|
||||||
|
You could use files of different shapes and sizes to test the
|
||||||
|
performance of `head` in different situations. For a larger file, you
|
||||||
|
could download a [database dump of Wikidata][1] or some related files
|
||||||
|
that the Wikimedia project provides. For example, [this file][2]
|
||||||
|
contains about 130 million lines.
|
||||||
|
|
||||||
|
Finally, you can compare the performance of the two versions of `head`
|
||||||
|
by running, for example,
|
||||||
|
|
||||||
|
hyperfine \
|
||||||
|
"head -n 100000 shakespeare.txt" \
|
||||||
|
"target/release/head -n 100000 shakespeare.txt"
|
||||||
|
|
||||||
|
[0]: https://github.com/sharkdp/hyperfine
|
||||||
|
[1]: https://www.wikidata.org/wiki/Wikidata:Database_download
|
||||||
|
[2]: https://dumps.wikimedia.org/wikidatawiki/20211001/wikidatawiki-20211001-pages-logging.xml.gz
|
|
@ -16,6 +16,7 @@ path = "src/head.rs"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
clap = { version = "2.33", features = ["wrap_help"] }
|
clap = { version = "2.33", features = ["wrap_help"] }
|
||||||
|
memchr = "2"
|
||||||
uucore = { version=">=0.0.9", package="uucore", path="../../uucore", features=["ringbuffer"] }
|
uucore = { version=">=0.0.9", package="uucore", path="../../uucore", features=["ringbuffer"] }
|
||||||
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
|
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
|
||||||
|
|
||||||
|
|
|
@ -3,12 +3,12 @@
|
||||||
// * For the full copyright and license information, please view the LICENSE
|
// * For the full copyright and license information, please view the LICENSE
|
||||||
// * file that was distributed with this source code.
|
// * file that was distributed with this source code.
|
||||||
|
|
||||||
// spell-checker:ignore (vars) zlines
|
// spell-checker:ignore (vars) zlines BUFWRITER
|
||||||
|
|
||||||
use clap::{crate_version, App, Arg};
|
use clap::{crate_version, App, Arg};
|
||||||
use std::convert::TryFrom;
|
use std::convert::TryFrom;
|
||||||
use std::ffi::OsString;
|
use std::ffi::OsString;
|
||||||
use std::io::{self, ErrorKind, Read, Seek, SeekFrom, Write};
|
use std::io::{self, BufWriter, ErrorKind, Read, Seek, SeekFrom, Write};
|
||||||
use uucore::display::Quotable;
|
use uucore::display::Quotable;
|
||||||
use uucore::{crash, show_error_custom_description};
|
use uucore::{crash, show_error_custom_description};
|
||||||
|
|
||||||
|
@ -16,6 +16,9 @@ const EXIT_FAILURE: i32 = 1;
|
||||||
const EXIT_SUCCESS: i32 = 0;
|
const EXIT_SUCCESS: i32 = 0;
|
||||||
const BUF_SIZE: usize = 65536;
|
const BUF_SIZE: usize = 65536;
|
||||||
|
|
||||||
|
/// The capacity in bytes for buffered writers.
|
||||||
|
const BUFWRITER_CAPACITY: usize = 16_384; // 16 kilobytes
|
||||||
|
|
||||||
const ABOUT: &str = "\
|
const ABOUT: &str = "\
|
||||||
Print the first 10 lines of each FILE to standard output.\n\
|
Print the first 10 lines of each FILE to standard output.\n\
|
||||||
With more than one FILE, precede each with a header giving the file name.\n\
|
With more than one FILE, precede each with a header giving the file name.\n\
|
||||||
|
@ -35,10 +38,10 @@ mod options {
|
||||||
}
|
}
|
||||||
mod lines;
|
mod lines;
|
||||||
mod parse;
|
mod parse;
|
||||||
mod split;
|
|
||||||
mod take;
|
mod take;
|
||||||
use lines::zlines;
|
use lines::zlines;
|
||||||
use take::take_all_but;
|
use take::take_all_but;
|
||||||
|
use take::take_lines;
|
||||||
|
|
||||||
pub fn uu_app() -> App<'static, 'static> {
|
pub fn uu_app() -> App<'static, 'static> {
|
||||||
App::new(uucore::util_name())
|
App::new(uucore::util_name())
|
||||||
|
@ -209,26 +212,18 @@ where
|
||||||
}
|
}
|
||||||
|
|
||||||
fn read_n_lines(input: &mut impl std::io::BufRead, n: usize, zero: bool) -> std::io::Result<()> {
|
fn read_n_lines(input: &mut impl std::io::BufRead, n: usize, zero: bool) -> std::io::Result<()> {
|
||||||
if n == 0 {
|
// Read the first `n` lines from the `input` reader.
|
||||||
return Ok(());
|
let separator = if zero { b'\0' } else { b'\n' };
|
||||||
}
|
let mut reader = take_lines(input, n, separator);
|
||||||
|
|
||||||
|
// Write those bytes to `stdout`.
|
||||||
let stdout = std::io::stdout();
|
let stdout = std::io::stdout();
|
||||||
let mut stdout = stdout.lock();
|
let stdout = stdout.lock();
|
||||||
let mut lines = 0usize;
|
let mut writer = BufWriter::with_capacity(BUFWRITER_CAPACITY, stdout);
|
||||||
split::walk_lines(input, zero, |e| match e {
|
|
||||||
split::Event::Data(dat) => {
|
io::copy(&mut reader, &mut writer)?;
|
||||||
stdout.write_all(dat)?;
|
|
||||||
Ok(true)
|
Ok(())
|
||||||
}
|
|
||||||
split::Event::Line => {
|
|
||||||
lines += 1;
|
|
||||||
if lines == n {
|
|
||||||
Ok(false)
|
|
||||||
} else {
|
|
||||||
Ok(true)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn read_but_last_n_bytes(input: &mut impl std::io::BufRead, n: usize) -> std::io::Result<()> {
|
fn read_but_last_n_bytes(input: &mut impl std::io::BufRead, n: usize) -> std::io::Result<()> {
|
||||||
|
|
|
@ -1,60 +0,0 @@
|
||||||
#[derive(Debug)]
|
|
||||||
pub enum Event<'a> {
|
|
||||||
Data(&'a [u8]),
|
|
||||||
Line,
|
|
||||||
}
|
|
||||||
/// Loops over the lines read from a BufRead.
|
|
||||||
/// # Arguments
|
|
||||||
/// * `input` the `BufRead` to read from
|
|
||||||
/// * `zero` whether to use 0u8 as a line delimiter
|
|
||||||
/// * `on_event` a closure receiving some bytes read in a slice, or
|
|
||||||
/// event signalling a line was just read.
|
|
||||||
/// this is guaranteed to be signalled *directly* after the
|
|
||||||
/// slice containing the (CR on win)LF / 0 is passed
|
|
||||||
///
|
|
||||||
/// Return whether to continue
|
|
||||||
pub fn walk_lines<F>(
|
|
||||||
input: &mut impl std::io::BufRead,
|
|
||||||
zero: bool,
|
|
||||||
mut on_event: F,
|
|
||||||
) -> std::io::Result<()>
|
|
||||||
where
|
|
||||||
F: FnMut(Event) -> std::io::Result<bool>,
|
|
||||||
{
|
|
||||||
let mut buffer = [0u8; super::BUF_SIZE];
|
|
||||||
loop {
|
|
||||||
let read = loop {
|
|
||||||
match input.read(&mut buffer) {
|
|
||||||
Ok(n) => break n,
|
|
||||||
Err(e) => match e.kind() {
|
|
||||||
std::io::ErrorKind::Interrupted => {}
|
|
||||||
_ => return Err(e),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
};
|
|
||||||
if read == 0 {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
let mut base = 0usize;
|
|
||||||
for (i, byte) in buffer[..read].iter().enumerate() {
|
|
||||||
match byte {
|
|
||||||
b'\n' if !zero => {
|
|
||||||
on_event(Event::Data(&buffer[base..=i]))?;
|
|
||||||
base = i + 1;
|
|
||||||
if !on_event(Event::Line)? {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
0u8 if zero => {
|
|
||||||
on_event(Event::Data(&buffer[base..=i]))?;
|
|
||||||
base = i + 1;
|
|
||||||
if !on_event(Event::Line)? {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
on_event(Event::Data(&buffer[base..read]))?;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,4 +1,8 @@
|
||||||
//! Take all but the last elements of an iterator.
|
//! Take all but the last elements of an iterator.
|
||||||
|
use std::io::Read;
|
||||||
|
|
||||||
|
use memchr::memchr_iter;
|
||||||
|
|
||||||
use uucore::ringbuffer::RingBuffer;
|
use uucore::ringbuffer::RingBuffer;
|
||||||
|
|
||||||
/// Create an iterator over all but the last `n` elements of `iter`.
|
/// Create an iterator over all but the last `n` elements of `iter`.
|
||||||
|
@ -58,10 +62,63 @@ where
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Like `std::io::Take`, but for lines instead of bytes.
|
||||||
|
///
|
||||||
|
/// This struct is generally created by calling [`take_lines`] on a
|
||||||
|
/// reader. Please see the documentation of [`take_lines`] for more
|
||||||
|
/// details.
|
||||||
|
pub struct TakeLines<T> {
|
||||||
|
inner: T,
|
||||||
|
limit: usize,
|
||||||
|
separator: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Read> Read for TakeLines<T> {
|
||||||
|
/// Read bytes from a buffer up to the requested number of lines.
|
||||||
|
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||||
|
if self.limit == 0 {
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
match self.inner.read(buf) {
|
||||||
|
Ok(0) => Ok(0),
|
||||||
|
Ok(n) => {
|
||||||
|
for i in memchr_iter(self.separator, &buf[..n]) {
|
||||||
|
self.limit -= 1;
|
||||||
|
if self.limit == 0 {
|
||||||
|
return Ok(i + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(n)
|
||||||
|
}
|
||||||
|
Err(e) => Err(e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create an adaptor that will read at most `limit` lines from a given reader.
|
||||||
|
///
|
||||||
|
/// This function returns a new instance of `Read` that will read at
|
||||||
|
/// most `limit` lines, after which it will always return EOF
|
||||||
|
/// (`Ok(0)`).
|
||||||
|
///
|
||||||
|
/// The `separator` defines the character to interpret as the line
|
||||||
|
/// ending. For the usual notion of "line", set this to `b'\n'`.
|
||||||
|
pub fn take_lines<R>(reader: R, limit: usize, separator: u8) -> TakeLines<R> {
|
||||||
|
TakeLines {
|
||||||
|
inner: reader,
|
||||||
|
limit,
|
||||||
|
separator,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
|
use std::io::BufRead;
|
||||||
|
use std::io::BufReader;
|
||||||
|
|
||||||
use crate::take::take_all_but;
|
use crate::take::take_all_but;
|
||||||
|
use crate::take::take_lines;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_fewer_elements() {
|
fn test_fewer_elements() {
|
||||||
|
@ -90,4 +147,33 @@ mod tests {
|
||||||
assert_eq!(Some(&2), iter.next());
|
assert_eq!(Some(&2), iter.next());
|
||||||
assert_eq!(None, iter.next());
|
assert_eq!(None, iter.next());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_zero_lines() {
|
||||||
|
let input_reader = std::io::Cursor::new("a\nb\nc\n");
|
||||||
|
let output_reader = BufReader::new(take_lines(input_reader, 0, b'\n'));
|
||||||
|
let mut iter = output_reader.lines().map(|l| l.unwrap());
|
||||||
|
assert_eq!(None, iter.next());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_fewer_lines() {
|
||||||
|
let input_reader = std::io::Cursor::new("a\nb\nc\n");
|
||||||
|
let output_reader = BufReader::new(take_lines(input_reader, 2, b'\n'));
|
||||||
|
let mut iter = output_reader.lines().map(|l| l.unwrap());
|
||||||
|
assert_eq!(Some(String::from("a")), iter.next());
|
||||||
|
assert_eq!(Some(String::from("b")), iter.next());
|
||||||
|
assert_eq!(None, iter.next());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_more_lines() {
|
||||||
|
let input_reader = std::io::Cursor::new("a\nb\nc\n");
|
||||||
|
let output_reader = BufReader::new(take_lines(input_reader, 4, b'\n'));
|
||||||
|
let mut iter = output_reader.lines().map(|l| l.unwrap());
|
||||||
|
assert_eq!(Some(String::from("a")), iter.next());
|
||||||
|
assert_eq!(Some(String::from("b")), iter.next());
|
||||||
|
assert_eq!(Some(String::from("c")), iter.next());
|
||||||
|
assert_eq!(None, iter.next());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue