diff --git a/Cargo.lock b/Cargo.lock index 3d4cc35be..ea67b34af 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1786,7 +1786,9 @@ dependencies = [ name = "uu_cut" version = "0.0.6" dependencies = [ + "bstr", "clap", + "memchr 2.3.4", "uucore", "uucore_procs", ] diff --git a/src/uu/cut/BENCHMARKING.md b/src/uu/cut/BENCHMARKING.md new file mode 100644 index 000000000..bd94cf93c --- /dev/null +++ b/src/uu/cut/BENCHMARKING.md @@ -0,0 +1,46 @@ +## Benchmarking cut + +### Performance profile + +In normal use cases a significant amount of the total execution time of `cut` +is spent performing I/O. When invoked with the `-f` option (cut fields) some +CPU time is spent on detecting fields (in `Searcher::next`). Other than that +some small amount of CPU time is spent on breaking the input stream into lines. + + +### How to + +When fixing bugs or adding features you might want to compare +performance before and after your code changes. + +- `hyperfine` can be used to accurately measure and compare the total + execution time of one or more commands. + + ``` + $ cargo build --release --package uu_cut + + $ hyperfine -w3 "./target/release/cut -f2-4,8 -d' ' input.txt" "cut -f2-4,8 -d' ' input.txt" + ``` + You can put those two commands in a shell script to be sure that you don't + forget to build after making any changes. + +When optimizing or fixing performance regressions seeing the number of times a +function is called, and the amount of time it takes can be useful. + +- `cargo flamegraph` generates flame graphs from function level metrics it records using `perf` or `dtrace` + + ``` + $ cargo flamegraph --bin cut --package uu_cut -- -f1,3-4 input.txt > /dev/null + ``` + + +### What to benchmark + +There are four different performance paths in `cut` to benchmark. + +- Byte ranges `-c`/`--characters` or `-b`/`--bytes` e.g. `cut -c 2,4,6-` +- Byte ranges with output delimiters e.g. `cut -c 4- --output-delimiter=/` +- Fields e.g. `cut -f -4` +- Fields with output delimiters e.g. `cut -f 7-10 --output-delimiter=:` + +Choose a test input file with large number of lines so that program startup time does not significantly affect the benchmark. diff --git a/src/uu/cut/Cargo.toml b/src/uu/cut/Cargo.toml index d892ddeb5..c863c1772 100644 --- a/src/uu/cut/Cargo.toml +++ b/src/uu/cut/Cargo.toml @@ -18,6 +18,8 @@ path = "src/cut.rs" clap = "2.33" uucore = { version=">=0.0.8", package="uucore", path="../../uucore" } uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" } +memchr = "2" +bstr = "0.2" [[bin]] name = "cut" diff --git a/src/uu/cut/src/buffer.rs b/src/uu/cut/src/buffer.rs deleted file mode 100644 index 6c3238be1..000000000 --- a/src/uu/cut/src/buffer.rs +++ /dev/null @@ -1,152 +0,0 @@ -// This file is part of the uutils coreutils package. -// -// (c) Rolf Morel -// (c) kwantam -// * substantial rewrite to use the `std::io::BufReader` trait -// -// For the full copyright and license information, please view the LICENSE -// file that was distributed with this source code. - -// spell-checker:ignore (ToDO) SRes Newl - -use std::io::Result as IoResult; -use std::io::{BufRead, BufReader, Read, Write}; - -#[allow(non_snake_case)] -pub mod Bytes { - use std::io::Write; - - pub trait Select { - fn select(&mut self, bytes: usize, out: Option<&mut W>) -> Selected; - } - - #[derive(PartialEq, Eq, Debug)] - pub enum Selected { - NewlineFound, - Complete(usize), - Partial(usize), - EndOfFile, - } -} - -#[derive(Debug)] -pub struct ByteReader -where - R: Read, -{ - inner: BufReader, - newline_char: u8, -} - -impl ByteReader { - pub fn new(read: R, newline_char: u8) -> ByteReader { - ByteReader { - inner: BufReader::with_capacity(4096, read), - newline_char, - } - } -} - -impl Read for ByteReader { - fn read(&mut self, buf: &mut [u8]) -> IoResult { - self.inner.read(buf) - } -} - -impl BufRead for ByteReader { - fn fill_buf(&mut self) -> IoResult<&[u8]> { - self.inner.fill_buf() - } - - fn consume(&mut self, amt: usize) { - self.inner.consume(amt) - } -} - -impl ByteReader { - pub fn consume_line(&mut self) -> usize { - let mut bytes_consumed = 0; - let mut consume_val; - let newline_char = self.newline_char; - - loop { - { - // need filled_buf to go out of scope - let filled_buf = match self.fill_buf() { - Ok(b) => { - if b.is_empty() { - return bytes_consumed; - } else { - b - } - } - Err(e) => crash!(1, "read error: {}", e), - }; - - if let Some(idx) = filled_buf.iter().position(|byte| *byte == newline_char) { - consume_val = idx + 1; - bytes_consumed += consume_val; - break; - } - - consume_val = filled_buf.len(); - } - - bytes_consumed += consume_val; - self.consume(consume_val); - } - - self.consume(consume_val); - bytes_consumed - } -} - -impl self::Bytes::Select for ByteReader { - fn select(&mut self, bytes: usize, out: Option<&mut W>) -> Bytes::Selected { - enum SRes { - Comp, - Part, - Newl, - } - - use self::Bytes::Selected::*; - - let newline_char = self.newline_char; - let (res, consume_val) = { - let buffer = match self.fill_buf() { - Err(e) => crash!(1, "read error: {}", e), - Ok(b) => b, - }; - - let (res, consume_val) = match buffer.len() { - 0 => return EndOfFile, - buf_used if bytes < buf_used => { - // because the output delimiter should only be placed between - // segments check if the byte after bytes is a newline - let buf_slice = &buffer[0..=bytes]; - - match buf_slice.iter().position(|byte| *byte == newline_char) { - Some(idx) => (SRes::Newl, idx + 1), - None => (SRes::Comp, bytes), - } - } - _ => match buffer.iter().position(|byte| *byte == newline_char) { - Some(idx) => (SRes::Newl, idx + 1), - None => (SRes::Part, buffer.len()), - }, - }; - - if let Some(out) = out { - crash_if_err!(1, out.write_all(&buffer[0..consume_val])); - } - (res, consume_val) - }; - - self.consume(consume_val); - match res { - SRes::Comp => Complete(consume_val), - SRes::Part => Partial(consume_val), - SRes::Newl => NewlineFound, - } - } -} diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 71c52601e..535027237 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -10,16 +10,17 @@ #[macro_use] extern crate uucore; +use bstr::io::BufReadExt; use clap::{App, Arg}; use std::fs::File; -use std::io::{stdin, stdout, BufRead, BufReader, Read, Stdout, Write}; +use std::io::{stdin, stdout, BufReader, BufWriter, Read, Write}; use std::path::Path; use self::searcher::Searcher; +use uucore::fs::is_stdout_interactive; use uucore::ranges::Range; use uucore::InvalidEncodingHandling; -mod buffer; mod searcher; static NAME: &str = "cut"; @@ -126,6 +127,14 @@ enum Mode { Fields(Vec, FieldOptions), } +fn stdout_writer() -> Box { + if is_stdout_interactive() { + Box::new(stdout()) + } else { + Box::new(BufWriter::new(stdout())) as Box + } +} + fn list_to_ranges(list: &str, complement: bool) -> Result, String> { if complement { Range::from_list(list).map(|r| uucore::ranges::complement(&r)) @@ -135,72 +144,35 @@ fn list_to_ranges(list: &str, complement: bool) -> Result, String> { } fn cut_bytes(reader: R, ranges: &[Range], opts: &Options) -> i32 { - use self::buffer::Bytes::Select; - use self::buffer::Bytes::Selected::*; - let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' }; - let mut buf_read = buffer::ByteReader::new(reader, newline_char); - let mut out = stdout(); + let buf_in = BufReader::new(reader); + let mut out = stdout_writer(); + let delim = opts + .out_delim + .as_ref() + .map_or("", String::as_str) + .as_bytes(); - 'newline: loop { - let mut cur_pos = 1; + let res = buf_in.for_byte_record(newline_char, |line| { let mut print_delim = false; - - for &Range { low, high } in ranges.iter() { - // skip up to low - let orig_pos = cur_pos; - loop { - match buf_read.select(low - cur_pos, None::<&mut Stdout>) { - NewlineFound => { - crash_if_err!(1, out.write_all(&[newline_char])); - continue 'newline; - } - Complete(len) => { - cur_pos += len; - break; - } - Partial(len) => cur_pos += len, - EndOfFile => { - if orig_pos != cur_pos { - crash_if_err!(1, out.write_all(&[newline_char])); - } - - break 'newline; - } - } + for &Range { low, high } in ranges { + if low > line.len() { + break; } - - if let Some(ref delim) = opts.out_delim { - if print_delim { - crash_if_err!(1, out.write_all(delim.as_bytes())); - } + if print_delim { + out.write_all(delim)?; + } else if opts.out_delim.is_some() { print_delim = true; } - - // write out from low to high - loop { - match buf_read.select(high - cur_pos + 1, Some(&mut out)) { - NewlineFound => continue 'newline, - Partial(len) => cur_pos += len, - Complete(_) => { - cur_pos = high + 1; - break; - } - EndOfFile => { - if cur_pos != low || low == high { - crash_if_err!(1, out.write_all(&[newline_char])); - } - - break 'newline; - } - } - } + // change `low` from 1-indexed value to 0-index value + let low = low - 1; + let high = high.min(line.len()); + out.write_all(&line[low..high])?; } - - buf_read.consume_line(); - crash_if_err!(1, out.write_all(&[newline_char])); - } - + out.write_all(&[newline_char])?; + Ok(true) + }); + crash_if_err!(1, res); 0 } @@ -213,23 +185,11 @@ fn cut_fields_delimiter( newline_char: u8, out_delim: &str, ) -> i32 { - let mut buf_in = BufReader::new(reader); - let mut out = stdout(); - let mut buffer = Vec::new(); + let buf_in = BufReader::new(reader); + let mut out = stdout_writer(); + let input_delim_len = delim.len(); - 'newline: loop { - buffer.clear(); - match buf_in.read_until(newline_char, &mut buffer) { - Ok(n) if n == 0 => break, - Err(e) => { - if buffer.is_empty() { - crash!(1, "read error: {}", e); - } - } - _ => (), - } - - let line = &buffer[..]; + let result = buf_in.for_byte_record_with_terminator(newline_char, |line| { let mut fields_pos = 1; let mut low_idx = 0; let mut delim_search = Searcher::new(line, delim.as_bytes()).peekable(); @@ -237,46 +197,46 @@ fn cut_fields_delimiter( if delim_search.peek().is_none() { if !only_delimited { - crash_if_err!(1, out.write_all(line)); + out.write_all(line)?; if line[line.len() - 1] != newline_char { - crash_if_err!(1, out.write_all(&[newline_char])); + out.write_all(&[newline_char])?; } } - continue; + return Ok(true); } - for &Range { low, high } in ranges.iter() { + for &Range { low, high } in ranges { if low - fields_pos > 0 { low_idx = match delim_search.nth(low - fields_pos - 1) { - Some((_, beyond_delim)) => beyond_delim, + Some(index) => index + input_delim_len, None => break, }; } for _ in 0..=high - low { if print_delim { - crash_if_err!(1, out.write_all(out_delim.as_bytes())); + out.write_all(out_delim.as_bytes())?; + } else { + print_delim = true; } match delim_search.next() { - Some((high_idx, next_low_idx)) => { + Some(high_idx) => { let segment = &line[low_idx..high_idx]; - crash_if_err!(1, out.write_all(segment)); + out.write_all(segment)?; - print_delim = true; - - low_idx = next_low_idx; + low_idx = high_idx + input_delim_len; fields_pos = high + 1; } None => { let segment = &line[low_idx..]; - crash_if_err!(1, out.write_all(segment)); + out.write_all(segment)?; if line[line.len() - 1] == newline_char { - continue 'newline; + return Ok(true); } break; } @@ -284,9 +244,10 @@ fn cut_fields_delimiter( } } - crash_if_err!(1, out.write_all(&[newline_char])); - } - + out.write_all(&[newline_char])?; + Ok(true) + }); + crash_if_err!(1, result); 0 } @@ -304,23 +265,11 @@ fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32 ); } - let mut buf_in = BufReader::new(reader); - let mut out = stdout(); - let mut buffer = Vec::new(); + let buf_in = BufReader::new(reader); + let mut out = stdout_writer(); + let delim_len = opts.delimiter.len(); - 'newline: loop { - buffer.clear(); - match buf_in.read_until(newline_char, &mut buffer) { - Ok(n) if n == 0 => break, - Err(e) => { - if buffer.is_empty() { - crash!(1, "read error: {}", e); - } - } - _ => (), - } - - let line = &buffer[..]; + let result = buf_in.for_byte_record_with_terminator(newline_char, |line| { let mut fields_pos = 1; let mut low_idx = 0; let mut delim_search = Searcher::new(line, opts.delimiter.as_bytes()).peekable(); @@ -328,53 +277,54 @@ fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32 if delim_search.peek().is_none() { if !opts.only_delimited { - crash_if_err!(1, out.write_all(line)); + out.write_all(line)?; if line[line.len() - 1] != newline_char { - crash_if_err!(1, out.write_all(&[newline_char])); + out.write_all(&[newline_char])?; } } - continue; + return Ok(true); } - for &Range { low, high } in ranges.iter() { + for &Range { low, high } in ranges { if low - fields_pos > 0 { - low_idx = match delim_search.nth(low - fields_pos - 1) { - Some((_, beyond_delim)) => beyond_delim, - None => break, - }; - } - - if print_delim && low_idx >= opts.delimiter.as_bytes().len() { - low_idx -= opts.delimiter.as_bytes().len(); + if let Some(delim_pos) = delim_search.nth(low - fields_pos - 1) { + low_idx = if print_delim { + delim_pos + } else { + delim_pos + delim_len + } + } else { + break; + } } match delim_search.nth(high - low) { - Some((high_idx, next_low_idx)) => { + Some(high_idx) => { let segment = &line[low_idx..high_idx]; - crash_if_err!(1, out.write_all(segment)); + out.write_all(segment)?; print_delim = true; - low_idx = next_low_idx; + low_idx = high_idx; fields_pos = high + 1; } None => { let segment = &line[low_idx..line.len()]; - crash_if_err!(1, out.write_all(segment)); + out.write_all(segment)?; if line[line.len() - 1] == newline_char { - continue 'newline; + return Ok(true); } break; } } } - - crash_if_err!(1, out.write_all(&[newline_char])); - } - + out.write_all(&[newline_char])?; + Ok(true) + }); + crash_if_err!(1, result); 0 } diff --git a/src/uu/cut/src/searcher.rs b/src/uu/cut/src/searcher.rs index f0821ff3a..5fe4a723b 100644 --- a/src/uu/cut/src/searcher.rs +++ b/src/uu/cut/src/searcher.rs @@ -5,7 +5,8 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -#[derive(Clone)] +use memchr::memchr; + pub struct Searcher<'a> { haystack: &'a [u8], needle: &'a [u8], @@ -14,6 +15,7 @@ pub struct Searcher<'a> { impl<'a> Searcher<'a> { pub fn new(haystack: &'a [u8], needle: &'a [u8]) -> Searcher<'a> { + assert!(!needle.is_empty()); Searcher { haystack, needle, @@ -23,30 +25,81 @@ impl<'a> Searcher<'a> { } impl<'a> Iterator for Searcher<'a> { - type Item = (usize, usize); + type Item = usize; - fn next(&mut self) -> Option<(usize, usize)> { - if self.needle.len() == 1 { - for offset in self.position..self.haystack.len() { - if self.haystack[offset] == self.needle[0] { - self.position = offset + 1; - return Some((offset, offset + 1)); + fn next(&mut self) -> Option { + loop { + if let Some(match_idx) = memchr(self.needle[0], self.haystack) { + if self.needle.len() == 1 + || self.haystack[match_idx + 1..].starts_with(&self.needle[1..]) + { + let match_pos = self.position + match_idx; + let skip = match_idx + self.needle.len(); + self.haystack = &self.haystack[skip..]; + self.position += skip; + return Some(match_pos); + } else { + let skip = match_idx + 1; + self.haystack = &self.haystack[skip..]; + self.position += skip; + // continue } - } - - self.position = self.haystack.len(); - return None; - } - - while self.position + self.needle.len() <= self.haystack.len() { - if &self.haystack[self.position..self.position + self.needle.len()] == self.needle { - let match_pos = self.position; - self.position += self.needle.len(); - return Some((match_pos, match_pos + self.needle.len())); } else { - self.position += 1; + return None; } } - None + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + const NEEDLE: &[u8] = "ab".as_bytes(); + + #[test] + fn test_normal() { + let iter = Searcher::new("a.a.a".as_bytes(), "a".as_bytes()); + let items: Vec = iter.collect(); + assert_eq!(vec![0, 2, 4], items); + } + + #[test] + fn test_empty() { + let iter = Searcher::new("".as_bytes(), "a".as_bytes()); + let items: Vec = iter.collect(); + assert_eq!(vec![] as Vec, items); + } + + fn test_multibyte(line: &[u8], expected: Vec) { + let iter = Searcher::new(line, NEEDLE); + let items: Vec = iter.collect(); + assert_eq!(expected, items); + } + + #[test] + fn test_multibyte_normal() { + test_multibyte("...ab...ab...".as_bytes(), vec![3, 8]); + } + + #[test] + fn test_multibyte_needle_head_at_end() { + test_multibyte("a".as_bytes(), vec![]); + } + + #[test] + fn test_multibyte_starting_needle() { + test_multibyte("ab...ab...".as_bytes(), vec![0, 5]); + } + + #[test] + fn test_multibyte_trailing_needle() { + test_multibyte("...ab...ab".as_bytes(), vec![3, 8]); + } + + #[test] + fn test_multibyte_first_byte_false_match() { + test_multibyte("aA..aCaC..ab..aD".as_bytes(), vec![10]); } }