mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 11:37:44 +00:00
Implement wc fast paths that skip Unicode decoding.
Byte, character, and line counting can all be done on the raw bytes of the incoming stream without decoding the Unicode characters. This fact was previously exploited in specific fast paths for counting characters and counting lines. This change unifies those fast paths into a single shared fast path, using const generics to specialize the function for each use case. This has the benefit of making sure that all combinations of these Unicode-oblivious fast paths benefit from the same optimization. On my laptop, this speeds up `wc -clm odyssey1024.txt` from 840ms to 120ms. I experimented with using a filter loop for line counting, but continuing to use the bytecount crate came out ahead by a significant margin.
This commit is contained in:
parent
ec9130a4d7
commit
d5f59f23fa
2 changed files with 44 additions and 36 deletions
|
@ -117,32 +117,21 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> (usize, Opti
|
|||
}
|
||||
}
|
||||
|
||||
pub(crate) fn count_bytes_and_lines_fast<R: Read>(
|
||||
handle: &mut R,
|
||||
) -> (WordCount, Option<io::Error>) {
|
||||
let mut total = WordCount::default();
|
||||
let mut buf = [0; BUF_SIZE];
|
||||
loop {
|
||||
match handle.read(&mut buf) {
|
||||
Ok(0) => return (total, None),
|
||||
Ok(n) => {
|
||||
total.bytes += n;
|
||||
total.lines += bytecount::count(&buf[..n], b'\n');
|
||||
}
|
||||
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
|
||||
Err(e) => return (total, Some(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a WordCount that counts the number of Unicode characters encoded in UTF-8 read via a Reader.
|
||||
/// Returns a WordCount that counts the number of bytes, lines, and/or the number of Unicode characters encoded in UTF-8 read via a Reader.
|
||||
///
|
||||
/// This corresponds to the `-m` command line flag to wc.
|
||||
/// This corresponds to the `-c`, `-l` and `-m` command line flags to wc.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `R` - A Reader from which the UTF-8 stream will be read.
|
||||
pub(crate) fn count_chars_fast<R: Read>(handle: &mut R) -> (WordCount, Option<io::Error>) {
|
||||
pub(crate) fn count_bytes_chars_and_lines_fast<
|
||||
R: Read,
|
||||
const COUNT_BYTES: bool,
|
||||
const COUNT_CHARS: bool,
|
||||
const COUNT_LINES: bool,
|
||||
>(
|
||||
handle: &mut R,
|
||||
) -> (WordCount, Option<io::Error>) {
|
||||
/// Mask of the value bits of a continuation byte
|
||||
const CONT_MASK: u8 = 0b0011_1111u8;
|
||||
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
|
||||
|
@ -154,10 +143,18 @@ pub(crate) fn count_chars_fast<R: Read>(handle: &mut R) -> (WordCount, Option<io
|
|||
match handle.read(&mut buf) {
|
||||
Ok(0) => return (total, None),
|
||||
Ok(n) => {
|
||||
total.chars += buf[..n]
|
||||
.iter()
|
||||
.filter(|&&byte| (byte & !CONT_MASK) != TAG_CONT_U8)
|
||||
.count();
|
||||
if COUNT_BYTES {
|
||||
total.bytes += n;
|
||||
}
|
||||
if COUNT_CHARS {
|
||||
total.chars += buf[..n]
|
||||
.iter()
|
||||
.filter(|&&byte| (byte & !CONT_MASK) != TAG_CONT_U8)
|
||||
.count();
|
||||
}
|
||||
if COUNT_LINES {
|
||||
total.lines += bytecount::count(&buf[..n], b'\n');
|
||||
}
|
||||
}
|
||||
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
|
||||
Err(e) => return (total, Some(e)),
|
||||
|
|
|
@ -13,7 +13,7 @@ extern crate uucore;
|
|||
mod count_fast;
|
||||
mod countable;
|
||||
mod word_count;
|
||||
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast, count_chars_fast};
|
||||
use count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast};
|
||||
use countable::WordCountable;
|
||||
use unicode_width::UnicodeWidthChar;
|
||||
use utf8::{BufReadDecoder, BufReadDecoderError};
|
||||
|
@ -315,7 +315,7 @@ fn word_count_from_reader<T: WordCountable>(
|
|||
) {
|
||||
// Specialize scanning loop to improve the performance.
|
||||
(false, false, false, false, false) => unreachable!(),
|
||||
(false, true, false, false, false) => count_chars_fast(&mut reader),
|
||||
|
||||
(true, false, false, false, false) => {
|
||||
// Fast path when only show_bytes is true.
|
||||
let (bytes, error) = count_bytes_fast(&mut reader);
|
||||
|
@ -327,10 +327,27 @@ fn word_count_from_reader<T: WordCountable>(
|
|||
error,
|
||||
)
|
||||
}
|
||||
(false, false, true, false, false) | (true, false, true, false, false) => {
|
||||
// Fast path when only (show_bytes || show_lines) is true.
|
||||
count_bytes_and_lines_fast(&mut reader)
|
||||
|
||||
// Fast paths that can be computed without Unicode decoding.
|
||||
(false, false, true, false, false) => {
|
||||
count_bytes_chars_and_lines_fast::<_, false, false, true>(&mut reader)
|
||||
}
|
||||
(false, true, false, false, false) => {
|
||||
count_bytes_chars_and_lines_fast::<_, false, true, false>(&mut reader)
|
||||
}
|
||||
(false, true, true, false, false) => {
|
||||
count_bytes_chars_and_lines_fast::<_, false, true, true>(&mut reader)
|
||||
}
|
||||
(true, false, true, false, false) => {
|
||||
count_bytes_chars_and_lines_fast::<_, true, false, true>(&mut reader)
|
||||
}
|
||||
(true, true, false, false, false) => {
|
||||
count_bytes_chars_and_lines_fast::<_, true, true, false>(&mut reader)
|
||||
}
|
||||
(true, true, true, false, false) => {
|
||||
count_bytes_chars_and_lines_fast::<_, true, true, true>(&mut reader)
|
||||
}
|
||||
|
||||
(_, false, false, false, true) => {
|
||||
word_count_from_reader_specialized::<_, false, false, false, true>(reader)
|
||||
}
|
||||
|
@ -349,9 +366,6 @@ fn word_count_from_reader<T: WordCountable>(
|
|||
(_, false, true, true, true) => {
|
||||
word_count_from_reader_specialized::<_, false, true, true, true>(reader)
|
||||
}
|
||||
(_, true, false, false, false) => {
|
||||
word_count_from_reader_specialized::<_, true, false, false, false>(reader)
|
||||
}
|
||||
(_, true, false, false, true) => {
|
||||
word_count_from_reader_specialized::<_, true, false, false, true>(reader)
|
||||
}
|
||||
|
@ -361,9 +375,6 @@ fn word_count_from_reader<T: WordCountable>(
|
|||
(_, true, false, true, true) => {
|
||||
word_count_from_reader_specialized::<_, true, false, true, true>(reader)
|
||||
}
|
||||
(_, true, true, false, false) => {
|
||||
word_count_from_reader_specialized::<_, true, true, false, false>(reader)
|
||||
}
|
||||
(_, true, true, false, true) => {
|
||||
word_count_from_reader_specialized::<_, true, true, false, true>(reader)
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue