mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-29 12:07:46 +00:00
wc: move counting code into WordCount::from_line()
Refactor the counting code from the inner loop of the `wc` program into the `WordCount::from_line()` associated function. This commit also splits that function up into other helper functions that encapsulate decoding characters and finding word boundaries from raw bytes. This commit also implements the `Sum` trait for the `WordCount` struct, so that we can simply call `sum()` on an iterator that yields `WordCount` instances.
This commit is contained in:
parent
50f4941d49
commit
ba8f4ea670
2 changed files with 95 additions and 62 deletions
|
@ -24,7 +24,6 @@ use std::cmp::max;
|
|||
use std::fs::File;
|
||||
use std::io::{self, Write};
|
||||
use std::path::Path;
|
||||
use std::str::from_utf8;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum WcError {
|
||||
|
@ -163,18 +162,6 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
|||
}
|
||||
}
|
||||
|
||||
// Named byte values used when looking for word boundaries in raw bytes.
const CR: u8 = b'\r';
const LF: u8 = b'\n';
const SPACE: u8 = b' ';
const TAB: u8 = b'\t';
const SYN: u8 = 0x16_u8;
const FF: u8 = 0x0C_u8;

/// Report whether `byte` is one of the separator bytes `SPACE`, `TAB`,
/// `CR`, `SYN` or `FF`.
///
/// Note that `LF` is not part of this set.
#[inline(always)]
fn is_word_separator(byte: u8) -> bool {
    match byte {
        SPACE | TAB | CR | SYN | FF => true,
        _ => false,
    }
}
|
||||
|
||||
fn word_count_from_reader<T: WordCountable>(
|
||||
mut reader: T,
|
||||
settings: &Settings,
|
||||
|
@ -195,58 +182,20 @@ fn word_count_from_reader<T: WordCountable>(
|
|||
// we do not need to decode the byte stream if we're only counting bytes/newlines
|
||||
let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;
|
||||
|
||||
let mut line_count: usize = 0;
|
||||
let mut word_count: usize = 0;
|
||||
let mut byte_count: usize = 0;
|
||||
let mut char_count: usize = 0;
|
||||
let mut longest_line_length: usize = 0;
|
||||
let mut ends_lf: bool;
|
||||
|
||||
// reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
|
||||
// hence the option wrapped in a result here
|
||||
for line_result in reader.lines() {
|
||||
let raw_line = match line_result {
|
||||
Ok(l) => l,
|
||||
// Sum the WordCount for each line. Show a warning for each line
|
||||
// that results in an IO error when trying to read it.
|
||||
let total = reader
|
||||
.lines()
|
||||
.filter_map(|res| match res {
|
||||
Ok(line) => Some(line),
|
||||
Err(e) => {
|
||||
show_warning!("Error while reading {}: {}", path, e);
|
||||
continue;
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
// GNU 'wc' only counts lines that end in LF as lines
|
||||
ends_lf = *raw_line.last().unwrap() == LF;
|
||||
line_count += ends_lf as usize;
|
||||
|
||||
byte_count += raw_line.len();
|
||||
|
||||
if decode_chars {
|
||||
// try and convert the bytes to UTF-8 first
|
||||
let current_char_count;
|
||||
match from_utf8(&raw_line[..]) {
|
||||
Ok(line) => {
|
||||
word_count += line.split_whitespace().count();
|
||||
current_char_count = line.chars().count();
|
||||
}
|
||||
Err(..) => {
|
||||
word_count += raw_line.split(|&x| is_word_separator(x)).count();
|
||||
current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count()
|
||||
}
|
||||
}
|
||||
char_count += current_char_count;
|
||||
if current_char_count > longest_line_length {
|
||||
// -L is a GNU 'wc' extension so same behavior on LF
|
||||
longest_line_length = current_char_count - (ends_lf as usize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(WordCount {
|
||||
bytes: byte_count,
|
||||
chars: char_count,
|
||||
lines: line_count,
|
||||
words: word_count,
|
||||
max_line_length: longest_line_length,
|
||||
})
|
||||
.map(|line| WordCount::from_line(&line, decode_chars))
|
||||
.sum();
|
||||
Ok(total)
|
||||
}
|
||||
|
||||
fn word_count_from_path(path: &str, settings: &Settings) -> WcResult<WordCount> {
|
||||
|
|
|
@ -1,5 +1,19 @@
|
|||
use std::cmp::max;
|
||||
use std::iter::Sum;
|
||||
use std::ops::{Add, AddAssign};
|
||||
use std::str::from_utf8;
|
||||
|
||||
// Named byte values used when looking for word boundaries in raw bytes.
const CR: u8 = b'\r';
const LF: u8 = b'\n';
const SPACE: u8 = b' ';
const TAB: u8 = b'\t';
const SYN: u8 = 0x16_u8;
const FF: u8 = 0x0C_u8;

/// Report whether `byte` is one of the separator bytes `SPACE`, `TAB`,
/// `CR`, `SYN` or `FF`.
///
/// Note that `LF` is not part of this set.
#[inline(always)]
fn is_word_separator(byte: u8) -> bool {
    match byte {
        SPACE | TAB | CR | SYN | FF => true,
        _ => false,
    }
}
|
||||
|
||||
#[derive(Debug, Default, Copy, Clone)]
|
||||
pub struct WordCount {
|
||||
|
@ -30,10 +44,80 @@ impl AddAssign for WordCount {
|
|||
}
|
||||
}
|
||||
|
||||
impl Sum for WordCount {
|
||||
fn sum<I>(iter: I) -> WordCount
|
||||
where
|
||||
I: Iterator<Item = WordCount>,
|
||||
{
|
||||
iter.fold(WordCount::default(), |acc, x| acc + x)
|
||||
}
|
||||
}
|
||||
|
||||
impl WordCount {
|
||||
/// Count the characters and whitespace-separated words in the given bytes.
|
||||
///
|
||||
/// `line` is a slice of bytes that will be decoded as ASCII characters.
|
||||
fn ascii_word_and_char_count(line: &[u8]) -> (usize, usize) {
|
||||
let word_count = line.split(|&x| is_word_separator(x)).count();
|
||||
let char_count = line.iter().filter(|c| c.is_ascii()).count();
|
||||
(word_count, char_count)
|
||||
}
|
||||
|
||||
/// Create a [`WordCount`] from a sequence of bytes representing a line.
|
||||
///
|
||||
/// If the last byte of `line` encodes a newline character (`\n`),
|
||||
/// then the [`lines`] field will be set to 1. Otherwise, it will
|
||||
/// be set to 0. The [`bytes`] field is simply the length of
|
||||
/// `line`.
|
||||
///
|
||||
/// If `decode_chars` is `false`, the [`chars`] and [`words`]
|
||||
/// fields will be set to 0. If it is `true`, this function will
|
||||
/// attempt to decode the bytes first as UTF-8, and failing that,
|
||||
/// as ASCII.
|
||||
pub fn from_line(line: &[u8], decode_chars: bool) -> WordCount {
|
||||
// GNU 'wc' only counts lines that end in LF as lines
|
||||
let lines = (*line.last().unwrap() == LF) as usize;
|
||||
let bytes = line.len();
|
||||
let (words, chars) = if decode_chars {
|
||||
WordCount::word_and_char_count(line)
|
||||
} else {
|
||||
(0, 0)
|
||||
};
|
||||
// -L is a GNU 'wc' extension so same behavior on LF
|
||||
let max_line_length = if chars > 0 { chars - lines } else { 0 };
|
||||
WordCount {
|
||||
bytes,
|
||||
chars,
|
||||
lines,
|
||||
words,
|
||||
max_line_length,
|
||||
}
|
||||
}
|
||||
|
||||
/// Count the UTF-8 characters and words in the given string slice.
|
||||
///
|
||||
/// `s` is a string slice that is assumed to be a UTF-8 string.
|
||||
fn utf8_word_and_char_count(s: &str) -> (usize, usize) {
|
||||
let word_count = s.split_whitespace().count();
|
||||
let char_count = s.chars().count();
|
||||
(word_count, char_count)
|
||||
}
|
||||
|
||||
pub fn with_title(self, title: &str) -> TitledWordCount {
|
||||
TitledWordCount { title, count: self }
|
||||
}
|
||||
|
||||
/// Count the characters and words in the given slice of bytes.
|
||||
///
|
||||
/// `line` is a slice of bytes that will be decoded as UTF-8
|
||||
/// characters, or if that fails, as ASCII characters.
|
||||
fn word_and_char_count(line: &[u8]) -> (usize, usize) {
|
||||
// try and convert the bytes to UTF-8 first
|
||||
match from_utf8(line) {
|
||||
Ok(s) => WordCount::utf8_word_and_char_count(s),
|
||||
Err(..) => WordCount::ascii_word_and_char_count(line),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This struct supplements the actual word count with a title that is displayed
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue