mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-08-05 07:27:46 +00:00
tail: Performance improvements
Improve performance of `tail` utility. Tail now uses performance-optimized memchr APIs when searching through a file for delimiters.
This commit is contained in:
parent
3971bb3b0c
commit
b264457c41
1 changed files with 48 additions and 37 deletions
|
@ -3,7 +3,8 @@
|
|||
// For the full copyright and license information, please view the LICENSE
|
||||
// file that was distributed with this source code.
|
||||
|
||||
// spell-checker:ignore (ToDO) seekable seek'd tail'ing ringbuffer ringbuf unwatch Uncategorized filehandle Signum
|
||||
// spell-checker:ignore (ToDO) seekable seek'd tail'ing ringbuffer ringbuf unwatch
|
||||
// spell-checker:ignore (ToDO) Uncategorized filehandle Signum memrchr
|
||||
// spell-checker:ignore (libs) kqueue
|
||||
// spell-checker:ignore (acronyms)
|
||||
// spell-checker:ignore (env/flags)
|
||||
|
@ -24,11 +25,12 @@ pub use args::uu_app;
|
|||
use args::{FilterMode, Settings, Signum, parse_args};
|
||||
use chunks::ReverseChunks;
|
||||
use follow::Observer;
|
||||
use memchr::{memchr_iter, memrchr_iter};
|
||||
use paths::{FileExtTail, HeaderPrinter, Input, InputKind, MetadataExtTail};
|
||||
use same_file::Handle;
|
||||
use std::cmp::Ordering;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufRead, BufReader, BufWriter, Read, Seek, SeekFrom, Write, stdin, stdout};
|
||||
use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Seek, SeekFrom, Write, stdin, stdout};
|
||||
use std::path::{Path, PathBuf};
|
||||
use uucore::display::Quotable;
|
||||
use uucore::error::{FromIo, UResult, USimpleError, get_exit_code, set_exit_code};
|
||||
|
@ -285,35 +287,43 @@ fn tail_stdin(
|
|||
/// let i = forwards_thru_file(&mut reader, 2, b'\n').unwrap();
|
||||
/// assert_eq!(i, 2);
|
||||
/// ```
|
||||
fn forwards_thru_file<R>(
|
||||
reader: &mut R,
|
||||
fn forwards_thru_file(
|
||||
reader: &mut impl Read,
|
||||
num_delimiters: u64,
|
||||
delimiter: u8,
|
||||
) -> std::io::Result<usize>
|
||||
where
|
||||
R: Read,
|
||||
{
|
||||
let mut reader = BufReader::new(reader);
|
||||
|
||||
let mut buf = vec![];
|
||||
) -> std::io::Result<usize> {
|
||||
// If num_delimiters == 0, always return 0.
|
||||
if num_delimiters == 0 {
|
||||
return Ok(0);
|
||||
}
|
||||
// Use a 32K buffer.
|
||||
let mut buf = [0; 32 * 1024];
|
||||
let mut total = 0;
|
||||
for _ in 0..num_delimiters {
|
||||
match reader.read_until(delimiter, &mut buf) {
|
||||
Ok(0) => {
|
||||
return Ok(total);
|
||||
}
|
||||
let mut count = 0;
|
||||
// Iterate through the input, using `count` to record the number of times `delimiter`
|
||||
// is seen. Once we find `num_delimiters` instances, return the offset of the byte
|
||||
// immediately following that delimiter.
|
||||
loop {
|
||||
match reader.read(&mut buf) {
|
||||
// Ok(0) => EOF before we found `num_delimiters` instances of `delimiter`.
|
||||
// Return the total number of bytes read in that case.
|
||||
Ok(0) => return Ok(total),
|
||||
Ok(n) => {
|
||||
// Use memchr_iter since it greatly improves search performance.
|
||||
for offset in memchr_iter(delimiter, &buf[..n]) {
|
||||
count += 1;
|
||||
if count == num_delimiters {
|
||||
// Return offset of the byte after the `delimiter` instance.
|
||||
return Ok(total + offset + 1);
|
||||
}
|
||||
}
|
||||
total += n;
|
||||
buf.clear();
|
||||
continue;
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
Err(e) if e.kind() == ErrorKind::Interrupted => continue,
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(total)
|
||||
}
|
||||
|
||||
/// Iterate over bytes in the file, in reverse, until we find the
|
||||
/// `num_delimiters` instance of `delimiter`. The `file` is left seek'd to the
|
||||
|
@ -322,28 +332,30 @@ fn backwards_thru_file(file: &mut File, num_delimiters: u64, delimiter: u8) {
|
|||
// This variable counts the number of delimiters found in the file
|
||||
// so far (reading from the end of the file toward the beginning).
|
||||
let mut counter = 0;
|
||||
|
||||
for (block_idx, slice) in ReverseChunks::new(file).enumerate() {
|
||||
let mut first_slice = true;
|
||||
for slice in ReverseChunks::new(file) {
|
||||
// Iterate over each byte in the slice in reverse order.
|
||||
let mut iter = slice.iter().enumerate().rev();
|
||||
let mut iter = memrchr_iter(delimiter, &slice);
|
||||
|
||||
// Ignore a trailing newline in the last block, if there is one.
|
||||
if block_idx == 0 {
|
||||
if first_slice {
|
||||
if let Some(c) = slice.last() {
|
||||
if *c == delimiter {
|
||||
iter.next();
|
||||
}
|
||||
}
|
||||
first_slice = false;
|
||||
}
|
||||
|
||||
// For each byte, increment the count of the number of
|
||||
// delimiters found. If we have found more than the specified
|
||||
// number of delimiters, terminate the search and seek to the
|
||||
// appropriate location in the file.
|
||||
for (i, ch) in iter {
|
||||
if *ch == delimiter {
|
||||
for i in iter {
|
||||
counter += 1;
|
||||
if counter >= num_delimiters {
|
||||
// We should never over-count - assert that.
|
||||
assert_eq!(counter, num_delimiters);
|
||||
// After each iteration of the outer loop, the
|
||||
// cursor in the file is at the *beginning* of the
|
||||
// block, so seeking forward by `i + 1` bytes puts
|
||||
|
@ -354,7 +366,6 @@ fn backwards_thru_file(file: &mut File, num_delimiters: u64, delimiter: u8) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// When tail'ing a file, we do not need to read the whole file from start to
|
||||
/// finish just to find the last n lines or bytes. Instead, we can seek to the
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue