mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-29 12:07:46 +00:00
Merge pull request #3740 from resistor/main
Implement wc fast paths that skip Unicode decoding.
This commit is contained in:
commit
2fa4d6a2bb
3 changed files with 71 additions and 36 deletions
|
@ -117,32 +117,21 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> (usize, Opti
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn count_bytes_and_lines_fast<R: Read>(
|
/// Returns a WordCount that counts the number of bytes, lines, and/or the number of Unicode characters encoded in UTF-8 read via a Reader.
|
||||||
handle: &mut R,
|
|
||||||
) -> (WordCount, Option<io::Error>) {
|
|
||||||
let mut total = WordCount::default();
|
|
||||||
let mut buf = [0; BUF_SIZE];
|
|
||||||
loop {
|
|
||||||
match handle.read(&mut buf) {
|
|
||||||
Ok(0) => return (total, None),
|
|
||||||
Ok(n) => {
|
|
||||||
total.bytes += n;
|
|
||||||
total.lines += bytecount::count(&buf[..n], b'\n');
|
|
||||||
}
|
|
||||||
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
|
|
||||||
Err(e) => return (total, Some(e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns a WordCount that counts the number of Unicode characters encoded in UTF-8 read via a Reader.
|
|
||||||
///
|
///
|
||||||
/// This corresponds to the `-m` command line flag to wc.
|
/// This corresponds to the `-c`, `-l` and `-m` command line flags to wc.
|
||||||
///
|
///
|
||||||
/// # Arguments
|
/// # Arguments
|
||||||
///
|
///
|
||||||
/// * `R` - A Reader from which the UTF-8 stream will be read.
|
/// * `R` - A Reader from which the UTF-8 stream will be read.
|
||||||
pub(crate) fn count_chars_fast<R: Read>(handle: &mut R) -> (WordCount, Option<io::Error>) {
|
pub(crate) fn count_bytes_chars_and_lines_fast<
|
||||||
|
R: Read,
|
||||||
|
const COUNT_BYTES: bool,
|
||||||
|
const COUNT_CHARS: bool,
|
||||||
|
const COUNT_LINES: bool,
|
||||||
|
>(
|
||||||
|
handle: &mut R,
|
||||||
|
) -> (WordCount, Option<io::Error>) {
|
||||||
/// Mask of the value bits of a continuation byte
|
/// Mask of the value bits of a continuation byte
|
||||||
const CONT_MASK: u8 = 0b0011_1111u8;
|
const CONT_MASK: u8 = 0b0011_1111u8;
|
||||||
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
|
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
|
||||||
|
@ -154,11 +143,19 @@ pub(crate) fn count_chars_fast<R: Read>(handle: &mut R) -> (WordCount, Option<io
|
||||||
match handle.read(&mut buf) {
|
match handle.read(&mut buf) {
|
||||||
Ok(0) => return (total, None),
|
Ok(0) => return (total, None),
|
||||||
Ok(n) => {
|
Ok(n) => {
|
||||||
|
if COUNT_BYTES {
|
||||||
|
total.bytes += n;
|
||||||
|
}
|
||||||
|
if COUNT_CHARS {
|
||||||
total.chars += buf[..n]
|
total.chars += buf[..n]
|
||||||
.iter()
|
.iter()
|
||||||
.filter(|&&byte| (byte & !CONT_MASK) != TAG_CONT_U8)
|
.filter(|&&byte| (byte & !CONT_MASK) != TAG_CONT_U8)
|
||||||
.count();
|
.count();
|
||||||
}
|
}
|
||||||
|
if COUNT_LINES {
|
||||||
|
total.lines += bytecount::count(&buf[..n], b'\n');
|
||||||
|
}
|
||||||
|
}
|
||||||
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
|
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
|
||||||
Err(e) => return (total, Some(e)),
|
Err(e) => return (total, Some(e)),
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,7 +13,7 @@ extern crate uucore;
|
||||||
mod count_fast;
|
mod count_fast;
|
||||||
mod countable;
|
mod countable;
|
||||||
mod word_count;
|
mod word_count;
|
||||||
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast, count_chars_fast};
|
use count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast};
|
||||||
use countable::WordCountable;
|
use countable::WordCountable;
|
||||||
use unicode_width::UnicodeWidthChar;
|
use unicode_width::UnicodeWidthChar;
|
||||||
use utf8::{BufReadDecoder, BufReadDecoderError};
|
use utf8::{BufReadDecoder, BufReadDecoderError};
|
||||||
|
@ -315,7 +315,7 @@ fn word_count_from_reader<T: WordCountable>(
|
||||||
) {
|
) {
|
||||||
// Specialize scanning loop to improve the performance.
|
// Specialize scanning loop to improve the performance.
|
||||||
(false, false, false, false, false) => unreachable!(),
|
(false, false, false, false, false) => unreachable!(),
|
||||||
(false, true, false, false, false) => count_chars_fast(&mut reader),
|
|
||||||
(true, false, false, false, false) => {
|
(true, false, false, false, false) => {
|
||||||
// Fast path when only show_bytes is true.
|
// Fast path when only show_bytes is true.
|
||||||
let (bytes, error) = count_bytes_fast(&mut reader);
|
let (bytes, error) = count_bytes_fast(&mut reader);
|
||||||
|
@ -327,10 +327,27 @@ fn word_count_from_reader<T: WordCountable>(
|
||||||
error,
|
error,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
(false, false, true, false, false) | (true, false, true, false, false) => {
|
|
||||||
// Fast path when only (show_bytes || show_lines) is true.
|
// Fast paths that can be computed without Unicode decoding.
|
||||||
count_bytes_and_lines_fast(&mut reader)
|
(false, false, true, false, false) => {
|
||||||
|
count_bytes_chars_and_lines_fast::<_, false, false, true>(&mut reader)
|
||||||
}
|
}
|
||||||
|
(false, true, false, false, false) => {
|
||||||
|
count_bytes_chars_and_lines_fast::<_, false, true, false>(&mut reader)
|
||||||
|
}
|
||||||
|
(false, true, true, false, false) => {
|
||||||
|
count_bytes_chars_and_lines_fast::<_, false, true, true>(&mut reader)
|
||||||
|
}
|
||||||
|
(true, false, true, false, false) => {
|
||||||
|
count_bytes_chars_and_lines_fast::<_, true, false, true>(&mut reader)
|
||||||
|
}
|
||||||
|
(true, true, false, false, false) => {
|
||||||
|
count_bytes_chars_and_lines_fast::<_, true, true, false>(&mut reader)
|
||||||
|
}
|
||||||
|
(true, true, true, false, false) => {
|
||||||
|
count_bytes_chars_and_lines_fast::<_, true, true, true>(&mut reader)
|
||||||
|
}
|
||||||
|
|
||||||
(_, false, false, false, true) => {
|
(_, false, false, false, true) => {
|
||||||
word_count_from_reader_specialized::<_, false, false, false, true>(reader)
|
word_count_from_reader_specialized::<_, false, false, false, true>(reader)
|
||||||
}
|
}
|
||||||
|
@ -349,9 +366,6 @@ fn word_count_from_reader<T: WordCountable>(
|
||||||
(_, false, true, true, true) => {
|
(_, false, true, true, true) => {
|
||||||
word_count_from_reader_specialized::<_, false, true, true, true>(reader)
|
word_count_from_reader_specialized::<_, false, true, true, true>(reader)
|
||||||
}
|
}
|
||||||
(_, true, false, false, false) => {
|
|
||||||
word_count_from_reader_specialized::<_, true, false, false, false>(reader)
|
|
||||||
}
|
|
||||||
(_, true, false, false, true) => {
|
(_, true, false, false, true) => {
|
||||||
word_count_from_reader_specialized::<_, true, false, false, true>(reader)
|
word_count_from_reader_specialized::<_, true, false, false, true>(reader)
|
||||||
}
|
}
|
||||||
|
@ -361,9 +375,6 @@ fn word_count_from_reader<T: WordCountable>(
|
||||||
(_, true, false, true, true) => {
|
(_, true, false, true, true) => {
|
||||||
word_count_from_reader_specialized::<_, true, false, true, true>(reader)
|
word_count_from_reader_specialized::<_, true, false, true, true>(reader)
|
||||||
}
|
}
|
||||||
(_, true, true, false, false) => {
|
|
||||||
word_count_from_reader_specialized::<_, true, true, false, false>(reader)
|
|
||||||
}
|
|
||||||
(_, true, true, false, true) => {
|
(_, true, true, false, true) => {
|
||||||
word_count_from_reader_specialized::<_, true, true, false, true>(reader)
|
word_count_from_reader_specialized::<_, true, true, false, true>(reader)
|
||||||
}
|
}
|
||||||
|
|
|
@ -104,6 +104,33 @@ fn test_utf8_chars() {
|
||||||
.stdout_is("442\n");
|
.stdout_is("442\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_utf8_bytes_chars() {
|
||||||
|
new_ucmd!()
|
||||||
|
.arg("-cm")
|
||||||
|
.pipe_in_fixture("UTF_8_weirdchars.txt")
|
||||||
|
.run()
|
||||||
|
.stdout_is(" 442 513\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_utf8_bytes_lines() {
|
||||||
|
new_ucmd!()
|
||||||
|
.arg("-cl")
|
||||||
|
.pipe_in_fixture("UTF_8_weirdchars.txt")
|
||||||
|
.run()
|
||||||
|
.stdout_is(" 25 513\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_utf8_bytes_chars_lines() {
|
||||||
|
new_ucmd!()
|
||||||
|
.arg("-cml")
|
||||||
|
.pipe_in_fixture("UTF_8_weirdchars.txt")
|
||||||
|
.run()
|
||||||
|
.stdout_is(" 25 442 513\n");
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_utf8_chars_words() {
|
fn test_utf8_chars_words() {
|
||||||
new_ucmd!()
|
new_ucmd!()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue