mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 11:37:44 +00:00
Implement wc fast paths that skip Unicode decoding.
Byte, character, and line counting can all be done on the raw bytes of the incoming stream without decoding the Unicode characters. This fact was previously exploited in specific fast paths for counting characters and counting lines. This change unifies those fast paths into a single shared fast path, using const generics to specialize the function for each use case. This has the benefit of making sure that all combinations of these Unicode-oblivious fast paths benefit from the same optimization. On my laptop, this speeds up `wc -clm odyssey1024.txt` from 840ms to 120ms. I experimented with using a filter loop for line counting, but continuing to use the bytecount crate came out ahead by a significant margin.
This commit is contained in:
parent
ec9130a4d7
commit
d5f59f23fa
2 changed files with 44 additions and 36 deletions
|
@ -117,32 +117,21 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> (usize, Opti
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn count_bytes_and_lines_fast<R: Read>(
|
/// Returns a WordCount that counts the number of bytes, lines, and/or the number of Unicode characters encoded in UTF-8 read via a Reader.
|
||||||
handle: &mut R,
|
|
||||||
) -> (WordCount, Option<io::Error>) {
|
|
||||||
let mut total = WordCount::default();
|
|
||||||
let mut buf = [0; BUF_SIZE];
|
|
||||||
loop {
|
|
||||||
match handle.read(&mut buf) {
|
|
||||||
Ok(0) => return (total, None),
|
|
||||||
Ok(n) => {
|
|
||||||
total.bytes += n;
|
|
||||||
total.lines += bytecount::count(&buf[..n], b'\n');
|
|
||||||
}
|
|
||||||
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
|
|
||||||
Err(e) => return (total, Some(e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns a WordCount that counts the number of Unicode characters encoded in UTF-8 read via a Reader.
|
|
||||||
///
|
///
|
||||||
/// This corresponds to the `-m` command line flag to wc.
|
/// This corresponds to the `-c`, `-l` and `-m` command line flags to wc.
|
||||||
///
|
///
|
||||||
/// # Arguments
|
/// # Arguments
|
||||||
///
|
///
|
||||||
/// * `R` - A Reader from which the UTF-8 stream will be read.
|
/// * `R` - A Reader from which the UTF-8 stream will be read.
|
||||||
pub(crate) fn count_chars_fast<R: Read>(handle: &mut R) -> (WordCount, Option<io::Error>) {
|
pub(crate) fn count_bytes_chars_and_lines_fast<
|
||||||
|
R: Read,
|
||||||
|
const COUNT_BYTES: bool,
|
||||||
|
const COUNT_CHARS: bool,
|
||||||
|
const COUNT_LINES: bool,
|
||||||
|
>(
|
||||||
|
handle: &mut R,
|
||||||
|
) -> (WordCount, Option<io::Error>) {
|
||||||
/// Mask of the value bits of a continuation byte
|
/// Mask of the value bits of a continuation byte
|
||||||
const CONT_MASK: u8 = 0b0011_1111u8;
|
const CONT_MASK: u8 = 0b0011_1111u8;
|
||||||
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
|
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
|
||||||
|
@ -154,10 +143,18 @@ pub(crate) fn count_chars_fast<R: Read>(handle: &mut R) -> (WordCount, Option<io
|
||||||
match handle.read(&mut buf) {
|
match handle.read(&mut buf) {
|
||||||
Ok(0) => return (total, None),
|
Ok(0) => return (total, None),
|
||||||
Ok(n) => {
|
Ok(n) => {
|
||||||
total.chars += buf[..n]
|
if COUNT_BYTES {
|
||||||
.iter()
|
total.bytes += n;
|
||||||
.filter(|&&byte| (byte & !CONT_MASK) != TAG_CONT_U8)
|
}
|
||||||
.count();
|
if COUNT_CHARS {
|
||||||
|
total.chars += buf[..n]
|
||||||
|
.iter()
|
||||||
|
.filter(|&&byte| (byte & !CONT_MASK) != TAG_CONT_U8)
|
||||||
|
.count();
|
||||||
|
}
|
||||||
|
if COUNT_LINES {
|
||||||
|
total.lines += bytecount::count(&buf[..n], b'\n');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
|
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
|
||||||
Err(e) => return (total, Some(e)),
|
Err(e) => return (total, Some(e)),
|
||||||
|
|
|
@ -13,7 +13,7 @@ extern crate uucore;
|
||||||
mod count_fast;
|
mod count_fast;
|
||||||
mod countable;
|
mod countable;
|
||||||
mod word_count;
|
mod word_count;
|
||||||
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast, count_chars_fast};
|
use count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast};
|
||||||
use countable::WordCountable;
|
use countable::WordCountable;
|
||||||
use unicode_width::UnicodeWidthChar;
|
use unicode_width::UnicodeWidthChar;
|
||||||
use utf8::{BufReadDecoder, BufReadDecoderError};
|
use utf8::{BufReadDecoder, BufReadDecoderError};
|
||||||
|
@ -315,7 +315,7 @@ fn word_count_from_reader<T: WordCountable>(
|
||||||
) {
|
) {
|
||||||
// Specialize scanning loop to improve the performance.
|
// Specialize scanning loop to improve the performance.
|
||||||
(false, false, false, false, false) => unreachable!(),
|
(false, false, false, false, false) => unreachable!(),
|
||||||
(false, true, false, false, false) => count_chars_fast(&mut reader),
|
|
||||||
(true, false, false, false, false) => {
|
(true, false, false, false, false) => {
|
||||||
// Fast path when only show_bytes is true.
|
// Fast path when only show_bytes is true.
|
||||||
let (bytes, error) = count_bytes_fast(&mut reader);
|
let (bytes, error) = count_bytes_fast(&mut reader);
|
||||||
|
@ -327,10 +327,27 @@ fn word_count_from_reader<T: WordCountable>(
|
||||||
error,
|
error,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
(false, false, true, false, false) | (true, false, true, false, false) => {
|
|
||||||
// Fast path when only (show_bytes || show_lines) is true.
|
// Fast paths that can be computed without Unicode decoding.
|
||||||
count_bytes_and_lines_fast(&mut reader)
|
(false, false, true, false, false) => {
|
||||||
|
count_bytes_chars_and_lines_fast::<_, false, false, true>(&mut reader)
|
||||||
}
|
}
|
||||||
|
(false, true, false, false, false) => {
|
||||||
|
count_bytes_chars_and_lines_fast::<_, false, true, false>(&mut reader)
|
||||||
|
}
|
||||||
|
(false, true, true, false, false) => {
|
||||||
|
count_bytes_chars_and_lines_fast::<_, false, true, true>(&mut reader)
|
||||||
|
}
|
||||||
|
(true, false, true, false, false) => {
|
||||||
|
count_bytes_chars_and_lines_fast::<_, true, false, true>(&mut reader)
|
||||||
|
}
|
||||||
|
(true, true, false, false, false) => {
|
||||||
|
count_bytes_chars_and_lines_fast::<_, true, true, false>(&mut reader)
|
||||||
|
}
|
||||||
|
(true, true, true, false, false) => {
|
||||||
|
count_bytes_chars_and_lines_fast::<_, true, true, true>(&mut reader)
|
||||||
|
}
|
||||||
|
|
||||||
(_, false, false, false, true) => {
|
(_, false, false, false, true) => {
|
||||||
word_count_from_reader_specialized::<_, false, false, false, true>(reader)
|
word_count_from_reader_specialized::<_, false, false, false, true>(reader)
|
||||||
}
|
}
|
||||||
|
@ -349,9 +366,6 @@ fn word_count_from_reader<T: WordCountable>(
|
||||||
(_, false, true, true, true) => {
|
(_, false, true, true, true) => {
|
||||||
word_count_from_reader_specialized::<_, false, true, true, true>(reader)
|
word_count_from_reader_specialized::<_, false, true, true, true>(reader)
|
||||||
}
|
}
|
||||||
(_, true, false, false, false) => {
|
|
||||||
word_count_from_reader_specialized::<_, true, false, false, false>(reader)
|
|
||||||
}
|
|
||||||
(_, true, false, false, true) => {
|
(_, true, false, false, true) => {
|
||||||
word_count_from_reader_specialized::<_, true, false, false, true>(reader)
|
word_count_from_reader_specialized::<_, true, false, false, true>(reader)
|
||||||
}
|
}
|
||||||
|
@ -361,9 +375,6 @@ fn word_count_from_reader<T: WordCountable>(
|
||||||
(_, true, false, true, true) => {
|
(_, true, false, true, true) => {
|
||||||
word_count_from_reader_specialized::<_, true, false, true, true>(reader)
|
word_count_from_reader_specialized::<_, true, false, true, true>(reader)
|
||||||
}
|
}
|
||||||
(_, true, true, false, false) => {
|
|
||||||
word_count_from_reader_specialized::<_, true, true, false, false>(reader)
|
|
||||||
}
|
|
||||||
(_, true, true, false, true) => {
|
(_, true, true, false, true) => {
|
||||||
word_count_from_reader_specialized::<_, true, true, false, true>(reader)
|
word_count_from_reader_specialized::<_, true, true, false, true>(reader)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue