mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-27 11:07:44 +00:00
Merge pull request #2712 from jfinkels/head-take-lines-reader
head: use std::io::copy() with TakeLines reader
This commit is contained in:
commit
811698b658
6 changed files with 146 additions and 82 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -2507,6 +2507,7 @@ name = "uu_head"
|
|||
version = "0.0.7"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"memchr 2.4.0",
|
||||
"uucore",
|
||||
"uucore_procs",
|
||||
]
|
||||
|
|
41
src/uu/head/BENCHMARKING.md
Normal file
41
src/uu/head/BENCHMARKING.md
Normal file
|
@ -0,0 +1,41 @@
|
|||
# Benchmarking to measure performance
|
||||
|
||||
To compare the performance of the `uutils` version of `head` with the
|
||||
GNU version of `head`, you can use a benchmarking tool like
|
||||
[hyperfine][0]. On Ubuntu 18.04 or later, you can install `hyperfine` by
|
||||
running
|
||||
|
||||
sudo apt-get install hyperfine
|
||||
|
||||
Next, build the `head` binary under the release profile:
|
||||
|
||||
cargo build --release -p uu_head
|
||||
|
||||
Now, get a text file to test `head` on. I used the *Complete Works of
|
||||
William Shakespeare*, which is in the public domain in the United States
|
||||
and most other parts of the world.
|
||||
|
||||
wget -O shakespeare.txt https://www.gutenberg.org/files/100/100-0.txt
|
||||
|
||||
This particular file has about 170,000 lines, each of which is no longer
|
||||
than 96 characters:
|
||||
|
||||
$ wc -lL shakespeare.txt
|
||||
170592 96 shakespeare.txt
|
||||
|
||||
You could use files of different shapes and sizes to test the
|
||||
performance of `head` in different situations. For a larger file, you
|
||||
could download a [database dump of Wikidata][1] or some related files
|
||||
that the Wikimedia project provides. For example, [this file][2]
|
||||
contains about 130 million lines.
|
||||
|
||||
Finally, you can compare the performance of the two versions of `head`
|
||||
by running, for example,
|
||||
|
||||
hyperfine \
|
||||
"head -n 100000 shakespeare.txt" \
|
||||
"target/release/head -n 100000 shakespeare.txt"
|
||||
|
||||
[0]: https://github.com/sharkdp/hyperfine
|
||||
[1]: https://www.wikidata.org/wiki/Wikidata:Database_download
|
||||
[2]: https://dumps.wikimedia.org/wikidatawiki/20211001/wikidatawiki-20211001-pages-logging.xml.gz
|
|
@ -16,6 +16,7 @@ path = "src/head.rs"
|
|||
|
||||
[dependencies]
|
||||
clap = { version = "2.33", features = ["wrap_help"] }
|
||||
memchr = "2"
|
||||
uucore = { version=">=0.0.9", package="uucore", path="../../uucore", features=["ringbuffer"] }
|
||||
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
|
||||
|
||||
|
|
|
@ -3,18 +3,21 @@
|
|||
// * For the full copyright and license information, please view the LICENSE
|
||||
// * file that was distributed with this source code.
|
||||
|
||||
// spell-checker:ignore (vars) zlines
|
||||
// spell-checker:ignore (vars) zlines BUFWRITER
|
||||
|
||||
use clap::{crate_version, App, Arg};
|
||||
use std::convert::TryFrom;
|
||||
use std::ffi::OsString;
|
||||
use std::io::{self, ErrorKind, Read, Seek, SeekFrom, Write};
|
||||
use std::io::{self, BufWriter, ErrorKind, Read, Seek, SeekFrom, Write};
|
||||
use uucore::display::Quotable;
|
||||
use uucore::error::{UResult, USimpleError};
|
||||
use uucore::show_error_custom_description;
|
||||
|
||||
const BUF_SIZE: usize = 65536;
|
||||
|
||||
/// The capacity in bytes for buffered writers.
|
||||
const BUFWRITER_CAPACITY: usize = 16_384; // 16 kilobytes
|
||||
|
||||
const ABOUT: &str = "\
|
||||
Print the first 10 lines of each FILE to standard output.\n\
|
||||
With more than one FILE, precede each with a header giving the file name.\n\
|
||||
|
@ -34,10 +37,10 @@ mod options {
|
|||
}
|
||||
mod lines;
|
||||
mod parse;
|
||||
mod split;
|
||||
mod take;
|
||||
use lines::zlines;
|
||||
use take::take_all_but;
|
||||
use take::take_lines;
|
||||
|
||||
pub fn uu_app() -> App<'static, 'static> {
|
||||
App::new(uucore::util_name())
|
||||
|
@ -208,26 +211,18 @@ where
|
|||
}
|
||||
|
||||
fn read_n_lines(input: &mut impl std::io::BufRead, n: usize, zero: bool) -> std::io::Result<()> {
|
||||
if n == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
// Read the first `n` lines from the `input` reader.
|
||||
let separator = if zero { b'\0' } else { b'\n' };
|
||||
let mut reader = take_lines(input, n, separator);
|
||||
|
||||
// Write those bytes to `stdout`.
|
||||
let stdout = std::io::stdout();
|
||||
let mut stdout = stdout.lock();
|
||||
let mut lines = 0usize;
|
||||
split::walk_lines(input, zero, |e| match e {
|
||||
split::Event::Data(dat) => {
|
||||
stdout.write_all(dat)?;
|
||||
Ok(true)
|
||||
}
|
||||
split::Event::Line => {
|
||||
lines += 1;
|
||||
if lines == n {
|
||||
Ok(false)
|
||||
} else {
|
||||
Ok(true)
|
||||
}
|
||||
}
|
||||
})
|
||||
let stdout = stdout.lock();
|
||||
let mut writer = BufWriter::with_capacity(BUFWRITER_CAPACITY, stdout);
|
||||
|
||||
io::copy(&mut reader, &mut writer)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_but_last_n_bytes(input: &mut impl std::io::BufRead, n: usize) -> std::io::Result<()> {
|
||||
|
|
|
@ -1,60 +0,0 @@
|
|||
/// A notification emitted by [`walk_lines`] while it scans its input.
#[derive(Debug)]
pub enum Event<'a> {
    /// A run of bytes that was just read from the input.
    Data(&'a [u8]),
    /// The `Data` slice delivered immediately before this event ended
    /// with a line delimiter.
    Line,
}
|
||||
/// Loops over the lines read from a BufRead.
|
||||
/// # Arguments
|
||||
/// * `input` the ReadBuf to read from
|
||||
/// * `zero` whether to use 0u8 as a line delimiter
|
||||
/// * `on_event` a closure receiving some bytes read in a slice, or
|
||||
/// event signalling a line was just read.
|
||||
/// this is guaranteed to be signalled *directly* after the
|
||||
/// slice containing the (CR on win)LF / 0 is passed
|
||||
///
|
||||
/// Return whether to continue
|
||||
pub fn walk_lines<F>(
|
||||
input: &mut impl std::io::BufRead,
|
||||
zero: bool,
|
||||
mut on_event: F,
|
||||
) -> std::io::Result<()>
|
||||
where
|
||||
F: FnMut(Event) -> std::io::Result<bool>,
|
||||
{
|
||||
let mut buffer = [0u8; super::BUF_SIZE];
|
||||
loop {
|
||||
let read = loop {
|
||||
match input.read(&mut buffer) {
|
||||
Ok(n) => break n,
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::Interrupted => {}
|
||||
_ => return Err(e),
|
||||
},
|
||||
}
|
||||
};
|
||||
if read == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
let mut base = 0usize;
|
||||
for (i, byte) in buffer[..read].iter().enumerate() {
|
||||
match byte {
|
||||
b'\n' if !zero => {
|
||||
on_event(Event::Data(&buffer[base..=i]))?;
|
||||
base = i + 1;
|
||||
if !on_event(Event::Line)? {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
0u8 if zero => {
|
||||
on_event(Event::Data(&buffer[base..=i]))?;
|
||||
base = i + 1;
|
||||
if !on_event(Event::Line)? {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
on_event(Event::Data(&buffer[base..read]))?;
|
||||
}
|
||||
}
|
|
@ -1,4 +1,8 @@
|
|||
//! Take all but the last elements of an iterator, or the first lines of a reader.
|
||||
use std::io::Read;
|
||||
|
||||
use memchr::memchr_iter;
|
||||
|
||||
use uucore::ringbuffer::RingBuffer;
|
||||
|
||||
/// Create an iterator over all but the last `n` elements of `iter`.
|
||||
|
@ -58,10 +62,63 @@ where
|
|||
}
|
||||
}
|
||||
|
||||
/// A reader adaptor that limits its output by lines rather than bytes,
/// in the spirit of `std::io::Take`.
///
/// Values of this type are normally obtained by calling [`take_lines`]
/// on a reader; see that function's documentation for details.
pub struct TakeLines<T> {
    /// The wrapped reader that supplies the bytes.
    inner: T,
    /// How many more line separators may still be passed through.
    limit: usize,
    /// The byte that terminates a line.
    separator: u8,
}
|
||||
|
||||
impl<T: Read> Read for TakeLines<T> {
|
||||
/// Read bytes from a buffer up to the requested number of lines.
|
||||
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||
if self.limit == 0 {
|
||||
return Ok(0);
|
||||
}
|
||||
match self.inner.read(buf) {
|
||||
Ok(0) => Ok(0),
|
||||
Ok(n) => {
|
||||
for i in memchr_iter(self.separator, &buf[..n]) {
|
||||
self.limit -= 1;
|
||||
if self.limit == 0 {
|
||||
return Ok(i + 1);
|
||||
}
|
||||
}
|
||||
Ok(n)
|
||||
}
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Create an adaptor that will read at most `limit` lines from a given reader.
|
||||
///
|
||||
/// This function returns a new instance of `Read` that will read at
|
||||
/// most `limit` lines, after which it will always return EOF
|
||||
/// (`Ok(0)`).
|
||||
///
|
||||
/// The `separator` defines the character to interpret as the line
|
||||
/// ending. For the usual notion of "line", set this to `b'\n'`.
|
||||
pub fn take_lines<R>(reader: R, limit: usize, separator: u8) -> TakeLines<R> {
|
||||
TakeLines {
|
||||
inner: reader,
|
||||
limit,
|
||||
separator,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::io::BufRead;
|
||||
use std::io::BufReader;
|
||||
|
||||
use crate::take::take_all_but;
|
||||
use crate::take::take_lines;
|
||||
|
||||
#[test]
|
||||
fn test_fewer_elements() {
|
||||
|
@ -90,4 +147,33 @@ mod tests {
|
|||
assert_eq!(Some(&2), iter.next());
|
||||
assert_eq!(None, iter.next());
|
||||
}
|
||||
|
||||
/// A limit of zero lines yields no output at all.
#[test]
fn test_zero_lines() {
    let source = std::io::Cursor::new("a\nb\nc\n");
    let limited = BufReader::new(take_lines(source, 0, b'\n'));
    let lines: Vec<String> = limited.lines().map(|line| line.unwrap()).collect();
    assert!(lines.is_empty());
}
|
||||
|
||||
/// A limit below the number of available lines truncates the output.
#[test]
fn test_fewer_lines() {
    let source = std::io::Cursor::new("a\nb\nc\n");
    let limited = BufReader::new(take_lines(source, 2, b'\n'));
    let lines: Vec<String> = limited.lines().map(|line| line.unwrap()).collect();
    assert_eq!(vec![String::from("a"), String::from("b")], lines);
}
|
||||
|
||||
/// A limit above the number of available lines passes everything through.
#[test]
fn test_more_lines() {
    let source = std::io::Cursor::new("a\nb\nc\n");
    let limited = BufReader::new(take_lines(source, 4, b'\n'));
    let lines: Vec<String> = limited.lines().map(|line| line.unwrap()).collect();
    assert_eq!(
        vec![String::from("a"), String::from("b"), String::from("c")],
        lines
    );
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue