1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-29 12:07:46 +00:00

wc: move counting code into WordCount::from_line()

Refactor the counting code from the inner loop of the `wc` program
into the `WordCount::from_line()` associated function. This commit
also splits that function up into other helper functions that
encapsulate decoding characters and finding word boundaries from raw
bytes.

This commit also implements the `Sum` trait for the `WordCount`
struct, so that we can simply call `sum()` on an iterator that yields
`WordCount` instances.
This commit is contained in:
Jeffrey Finkelstein 2021-05-04 22:13:28 -04:00 committed by Sylvestre Ledru
parent 50f4941d49
commit ba8f4ea670
2 changed files with 95 additions and 62 deletions

View file

@ -24,7 +24,6 @@ use std::cmp::max;
use std::fs::File;
use std::io::{self, Write};
use std::path::Path;
use std::str::from_utf8;
#[derive(Error, Debug)]
pub enum WcError {
@ -163,18 +162,6 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
}
}
/// Carriage return.
const CR: u8 = b'\r';
/// Line feed.
const LF: u8 = b'\n';
/// Plain space.
const SPACE: u8 = b' ';
/// Horizontal tab.
const TAB: u8 = b'\t';
/// ASCII control byte 0x16 (synchronous idle).
const SYN: u8 = 0x16_u8;
/// Form feed.
const FF: u8 = 0x0C_u8;

/// Report whether `byte` separates words when a line is scanned as raw bytes.
///
/// The separators are `SPACE`, `TAB`, `CR`, `SYN` and `FF`; `LF` is not
/// part of this set.
#[inline(always)]
fn is_word_separator(byte: u8) -> bool {
    matches!(byte, SPACE | TAB | CR | SYN | FF)
}
fn word_count_from_reader<T: WordCountable>(
mut reader: T,
settings: &Settings,
@ -195,58 +182,20 @@ fn word_count_from_reader<T: WordCountable>(
// we do not need to decode the byte stream if we're only counting bytes/newlines
let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;
let mut line_count: usize = 0;
let mut word_count: usize = 0;
let mut byte_count: usize = 0;
let mut char_count: usize = 0;
let mut longest_line_length: usize = 0;
let mut ends_lf: bool;
// reading from a TTY seems to raise an error condition, rather than returning Some(0) like a file,
// hence the Option wrapped in a Result here
for line_result in reader.lines() {
let raw_line = match line_result {
Ok(l) => l,
// Sum the WordCount for each line. Show a warning for each line
// that results in an IO error when trying to read it.
let total = reader
.lines()
.filter_map(|res| match res {
Ok(line) => Some(line),
Err(e) => {
show_warning!("Error while reading {}: {}", path, e);
continue;
None
}
};
// GNU 'wc' only counts lines that end in LF as lines
ends_lf = *raw_line.last().unwrap() == LF;
line_count += ends_lf as usize;
byte_count += raw_line.len();
if decode_chars {
// try and convert the bytes to UTF-8 first
let current_char_count;
match from_utf8(&raw_line[..]) {
Ok(line) => {
word_count += line.split_whitespace().count();
current_char_count = line.chars().count();
}
Err(..) => {
word_count += raw_line.split(|&x| is_word_separator(x)).count();
current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count()
}
}
char_count += current_char_count;
if current_char_count > longest_line_length {
// -L is a GNU 'wc' extension so same behavior on LF
longest_line_length = current_char_count - (ends_lf as usize);
}
}
}
Ok(WordCount {
bytes: byte_count,
chars: char_count,
lines: line_count,
words: word_count,
max_line_length: longest_line_length,
})
})
.map(|line| WordCount::from_line(&line, decode_chars))
.sum();
Ok(total)
}
fn word_count_from_path(path: &str, settings: &Settings) -> WcResult<WordCount> {

View file

@ -1,5 +1,19 @@
use std::cmp::max;
use std::iter::Sum;
use std::ops::{Add, AddAssign};
use std::str::from_utf8;
/// Carriage return.
const CR: u8 = b'\r';
/// Line feed; marks the end of a counted line.
const LF: u8 = b'\n';
/// Plain space.
const SPACE: u8 = b' ';
/// Horizontal tab.
const TAB: u8 = b'\t';
/// ASCII control byte 0x16 (synchronous idle).
// NOTE(review): vertical tab is 0x0B, not 0x16 — confirm SYN is the intended separator.
const SYN: u8 = 0x16_u8;
/// Form feed.
const FF: u8 = 0x0C_u8;

/// Return `true` when `byte` is one of the word-separating bytes.
///
/// Used when a line cannot be decoded as UTF-8 and has to be split on
/// raw bytes instead. `LF` is not in the separator set.
#[inline(always)]
fn is_word_separator(byte: u8) -> bool {
    [SPACE, TAB, CR, SYN, FF].contains(&byte)
}
#[derive(Debug, Default, Copy, Clone)]
pub struct WordCount {
@ -30,10 +44,80 @@ impl AddAssign for WordCount {
}
}
impl Sum for WordCount {
fn sum<I>(iter: I) -> WordCount
where
I: Iterator<Item = WordCount>,
{
iter.fold(WordCount::default(), |acc, x| acc + x)
}
}
impl WordCount {
    /// Count the words and characters in bytes that are not valid UTF-8.
    ///
    /// Returns `(words, chars)`. A word is a non-empty run of bytes between
    /// separators, where a separator is `LF` or any byte accepted by
    /// `is_word_separator`. Only bytes in the ASCII range are counted as
    /// characters.
    fn ascii_word_and_char_count(line: &[u8]) -> (usize, usize) {
        // Empty runs are skipped so that leading, trailing and repeated
        // separators do not inflate the count; this matches what
        // `str::split_whitespace` does on the UTF-8 path.
        let word_count = line
            .split(|&byte| byte == LF || is_word_separator(byte))
            .filter(|word| !word.is_empty())
            .count();
        let char_count = line.iter().filter(|byte| byte.is_ascii()).count();
        (word_count, char_count)
    }

    /// Create a [`WordCount`] from a sequence of bytes representing a line.
    ///
    /// * `lines` is 1 if the last byte of `line` is a newline (`\n`) and 0
    ///   otherwise; an empty slice therefore yields 0 instead of panicking.
    /// * `bytes` is simply the length of `line`.
    /// * If `decode_chars` is `false`, `chars`, `words` and
    ///   `max_line_length` are all 0. If it is `true`, the bytes are
    ///   decoded as UTF-8 and, failing that, counted as ASCII.
    /// * `max_line_length` is the character count without the trailing
    ///   newline.
    pub fn from_line(line: &[u8], decode_chars: bool) -> WordCount {
        // GNU 'wc' only counts lines that end in LF as lines. Comparing the
        // `Option` directly avoids unwrapping on an empty slice.
        let lines = usize::from(line.last() == Some(&LF));
        let bytes = line.len();
        let (words, chars) = if decode_chars {
            WordCount::word_and_char_count(line)
        } else {
            (0, 0)
        };
        // -L is a GNU 'wc' extension so same behavior on LF: the terminating
        // newline does not contribute to the length of the line.
        let max_line_length = chars.saturating_sub(lines);
        WordCount {
            bytes,
            chars,
            lines,
            words,
            max_line_length,
        }
    }

    /// Count the words and characters in a string slice.
    ///
    /// Returns `(words, chars)`, where words are split on Unicode
    /// whitespace and characters are Unicode scalar values.
    fn utf8_word_and_char_count(s: &str) -> (usize, usize) {
        let word_count = s.split_whitespace().count();
        let char_count = s.chars().count();
        (word_count, char_count)
    }

    /// Pair this count with the `title` it should be displayed under.
    pub fn with_title(self, title: &str) -> TitledWordCount {
        TitledWordCount { title, count: self }
    }

    /// Count the words and characters in the given slice of bytes.
    ///
    /// Returns `(words, chars)`. `line` is decoded as UTF-8 when it is
    /// valid UTF-8; otherwise it is counted byte by byte as ASCII.
    fn word_and_char_count(line: &[u8]) -> (usize, usize) {
        // try and convert the bytes to UTF-8 first
        match from_utf8(line) {
            Ok(s) => WordCount::utf8_word_and_char_count(s),
            Err(..) => WordCount::ascii_word_and_char_count(line),
        }
    }
}
/// This struct supplements the actual word count with a title that is displayed