From b264457c41151ae7842c990d2e822332a38b8bd1 Mon Sep 17 00:00:00 2001 From: Karl McDowall Date: Thu, 20 Mar 2025 14:52:33 -0600 Subject: [PATCH] tail: Performace improvements Improve performance of `tail` utility. Tail now uses performance-optimized memchr APIs when searching through a file for delimiters. --- src/uu/tail/src/tail.rs | 85 +++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 37 deletions(-) diff --git a/src/uu/tail/src/tail.rs b/src/uu/tail/src/tail.rs index 11af1a685..4bf7841c2 100644 --- a/src/uu/tail/src/tail.rs +++ b/src/uu/tail/src/tail.rs @@ -3,7 +3,8 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore (ToDO) seekable seek'd tail'ing ringbuffer ringbuf unwatch Uncategorized filehandle Signum +// spell-checker:ignore (ToDO) seekable seek'd tail'ing ringbuffer ringbuf unwatch +// spell-checker:ignore (ToDO) Uncategorized filehandle Signum memrchr // spell-checker:ignore (libs) kqueue // spell-checker:ignore (acronyms) // spell-checker:ignore (env/flags) @@ -24,11 +25,12 @@ pub use args::uu_app; use args::{FilterMode, Settings, Signum, parse_args}; use chunks::ReverseChunks; use follow::Observer; +use memchr::{memchr_iter, memrchr_iter}; use paths::{FileExtTail, HeaderPrinter, Input, InputKind, MetadataExtTail}; use same_file::Handle; use std::cmp::Ordering; use std::fs::File; -use std::io::{self, BufRead, BufReader, BufWriter, Read, Seek, SeekFrom, Write, stdin, stdout}; +use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Seek, SeekFrom, Write, stdin, stdout}; use std::path::{Path, PathBuf}; use uucore::display::Quotable; use uucore::error::{FromIo, UResult, USimpleError, get_exit_code, set_exit_code}; @@ -285,34 +287,42 @@ fn tail_stdin( /// let i = forwards_thru_file(&mut reader, 2, b'\n').unwrap(); /// assert_eq!(i, 2); /// ``` -fn forwards_thru_file( - reader: &mut R, +fn forwards_thru_file( + reader: &mut impl Read, num_delimiters: u64, delimiter: u8, -) -> std::io::Result -where - R: Read, -{ - let mut reader = BufReader::new(reader); - - let mut buf = vec![]; +) -> std::io::Result { + // If num_delimiters == 0, always return 0. + if num_delimiters == 0 { + return Ok(0); + } + // Use a 32K buffer. + let mut buf = [0; 32 * 1024]; let mut total = 0; - for _ in 0..num_delimiters { - match reader.read_until(delimiter, &mut buf) { - Ok(0) => { - return Ok(total); - } + let mut count = 0; + // Iterate through the input, using `count` to record the number of times `delimiter` + // is seen. Once we find `num_delimiters` instances, return the offset of the byte + // immediately following that delimiter. + loop { + match reader.read(&mut buf) { + // Ok(0) => EoF before we found `num_delimiters` instance of `delimiter`. + // Return the total number of bytes read in that case. + Ok(0) => return Ok(total), Ok(n) => { + // Use memchr_iter since it greatly improves search performance. + for offset in memchr_iter(delimiter, &buf[..n]) { + count += 1; + if count == num_delimiters { + // Return offset of the byte after the `delimiter` instance. + return Ok(total + offset + 1); + } + } total += n; - buf.clear(); - continue; - } - Err(e) => { - return Err(e); } + Err(e) if e.kind() == ErrorKind::Interrupted => continue, + Err(e) => return Err(e), } } - Ok(total) } /// Iterate over bytes in the file, in reverse, until we find the @@ -322,35 +332,36 @@ fn backwards_thru_file(file: &mut File, num_delimiters: u64, delimiter: u8) { // This variable counts the number of delimiters found in the file // so far (reading from the end of the file toward the beginning). let mut counter = 0; - - for (block_idx, slice) in ReverseChunks::new(file).enumerate() { + let mut first_slice = true; + for slice in ReverseChunks::new(file) { // Iterate over each byte in the slice in reverse order. - let mut iter = slice.iter().enumerate().rev(); + let mut iter = memrchr_iter(delimiter, &slice); // Ignore a trailing newline in the last block, if there is one. - if block_idx == 0 { + if first_slice { if let Some(c) = slice.last() { if *c == delimiter { iter.next(); } } + first_slice = false; } // For each byte, increment the count of the number of // delimiters found. If we have found more than the specified // number of delimiters, terminate the search and seek to the // appropriate location in the file. - for (i, ch) in iter { - if *ch == delimiter { - counter += 1; - if counter >= num_delimiters { - // After each iteration of the outer loop, the - // cursor in the file is at the *beginning* of the - // block, so seeking forward by `i + 1` bytes puts - // us right after the found delimiter. - file.seek(SeekFrom::Current((i + 1) as i64)).unwrap(); - return; - } + for i in iter { + counter += 1; + if counter >= num_delimiters { + // We should never over-count - assert that. + assert_eq!(counter, num_delimiters); + // After each iteration of the outer loop, the + // cursor in the file is at the *beginning* of the + // block, so seeking forward by `i + 1` bytes puts + // us right after the found delimiter. + file.seek(SeekFrom::Current((i + 1) as i64)).unwrap(); + return; } } }