mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-29 03:57:44 +00:00
wc: move counting code into WordCount::from_line()
Refactor the counting code from the inner loop of the `wc` program into the `WordCount::from_line()` associated function. This commit also splits that function into smaller helper functions that encapsulate decoding characters and finding word boundaries in raw bytes. Finally, it implements the `Sum` trait for the `WordCount` struct, so that callers can simply call `sum()` on an iterator that yields `WordCount` instances.
This commit is contained in:
parent
50f4941d49
commit
ba8f4ea670
2 changed files with 95 additions and 62 deletions
|
@ -24,7 +24,6 @@ use std::cmp::max;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, Write};
|
use std::io::{self, Write};
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::str::from_utf8;
|
|
||||||
|
|
||||||
#[derive(Error, Debug)]
|
#[derive(Error, Debug)]
|
||||||
pub enum WcError {
|
pub enum WcError {
|
||||||
|
@ -163,18 +162,6 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const CR: u8 = b'\r';
|
|
||||||
const LF: u8 = b'\n';
|
|
||||||
const SPACE: u8 = b' ';
|
|
||||||
const TAB: u8 = b'\t';
|
|
||||||
const SYN: u8 = 0x16_u8;
|
|
||||||
const FF: u8 = 0x0C_u8;
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn is_word_separator(byte: u8) -> bool {
|
|
||||||
byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF
|
|
||||||
}
|
|
||||||
|
|
||||||
fn word_count_from_reader<T: WordCountable>(
|
fn word_count_from_reader<T: WordCountable>(
|
||||||
mut reader: T,
|
mut reader: T,
|
||||||
settings: &Settings,
|
settings: &Settings,
|
||||||
|
@ -195,58 +182,20 @@ fn word_count_from_reader<T: WordCountable>(
|
||||||
// we do not need to decode the byte stream if we're only counting bytes/newlines
|
// we do not need to decode the byte stream if we're only counting bytes/newlines
|
||||||
let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;
|
let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;
|
||||||
|
|
||||||
let mut line_count: usize = 0;
|
// Sum the WordCount for each line. Show a warning for each line
|
||||||
let mut word_count: usize = 0;
|
// that results in an IO error when trying to read it.
|
||||||
let mut byte_count: usize = 0;
|
let total = reader
|
||||||
let mut char_count: usize = 0;
|
.lines()
|
||||||
let mut longest_line_length: usize = 0;
|
.filter_map(|res| match res {
|
||||||
let mut ends_lf: bool;
|
Ok(line) => Some(line),
|
||||||
|
|
||||||
// reading from a TTY seems to raise an error condition, rather than returning Some(0) like a file,
|
|
||||||
// hence the option wrapped in a result here
|
|
||||||
for line_result in reader.lines() {
|
|
||||||
let raw_line = match line_result {
|
|
||||||
Ok(l) => l,
|
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
show_warning!("Error while reading {}: {}", path, e);
|
show_warning!("Error while reading {}: {}", path, e);
|
||||||
continue;
|
None
|
||||||
}
|
}
|
||||||
};
|
})
|
||||||
|
.map(|line| WordCount::from_line(&line, decode_chars))
|
||||||
// GNU 'wc' only counts lines that end in LF as lines
|
.sum();
|
||||||
ends_lf = *raw_line.last().unwrap() == LF;
|
Ok(total)
|
||||||
line_count += ends_lf as usize;
|
|
||||||
|
|
||||||
byte_count += raw_line.len();
|
|
||||||
|
|
||||||
if decode_chars {
|
|
||||||
// try and convert the bytes to UTF-8 first
|
|
||||||
let current_char_count;
|
|
||||||
match from_utf8(&raw_line[..]) {
|
|
||||||
Ok(line) => {
|
|
||||||
word_count += line.split_whitespace().count();
|
|
||||||
current_char_count = line.chars().count();
|
|
||||||
}
|
|
||||||
Err(..) => {
|
|
||||||
word_count += raw_line.split(|&x| is_word_separator(x)).count();
|
|
||||||
current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
char_count += current_char_count;
|
|
||||||
if current_char_count > longest_line_length {
|
|
||||||
// -L is a GNU 'wc' extension so same behavior on LF
|
|
||||||
longest_line_length = current_char_count - (ends_lf as usize);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(WordCount {
|
|
||||||
bytes: byte_count,
|
|
||||||
chars: char_count,
|
|
||||||
lines: line_count,
|
|
||||||
words: word_count,
|
|
||||||
max_line_length: longest_line_length,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn word_count_from_path(path: &str, settings: &Settings) -> WcResult<WordCount> {
|
fn word_count_from_path(path: &str, settings: &Settings) -> WcResult<WordCount> {
|
||||||
|
|
|
@ -1,5 +1,19 @@
|
||||||
use std::cmp::max;
|
use std::cmp::max;
|
||||||
|
use std::iter::Sum;
|
||||||
use std::ops::{Add, AddAssign};
|
use std::ops::{Add, AddAssign};
|
||||||
|
use std::str::from_utf8;
|
||||||
|
|
||||||
|
const CR: u8 = b'\r';
|
||||||
|
const LF: u8 = b'\n';
|
||||||
|
const SPACE: u8 = b' ';
|
||||||
|
const TAB: u8 = b'\t';
|
||||||
|
const SYN: u8 = 0x16_u8;
|
||||||
|
const FF: u8 = 0x0C_u8;
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn is_word_separator(byte: u8) -> bool {
|
||||||
|
byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Default, Copy, Clone)]
|
#[derive(Debug, Default, Copy, Clone)]
|
||||||
pub struct WordCount {
|
pub struct WordCount {
|
||||||
|
@ -30,10 +44,80 @@ impl AddAssign for WordCount {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Sum for WordCount {
|
||||||
|
fn sum<I>(iter: I) -> WordCount
|
||||||
|
where
|
||||||
|
I: Iterator<Item = WordCount>,
|
||||||
|
{
|
||||||
|
iter.fold(WordCount::default(), |acc, x| acc + x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl WordCount {
|
impl WordCount {
|
||||||
|
/// Count the characters and whitespace-separated words in the given bytes.
|
||||||
|
///
|
||||||
|
/// `line` is a slice of bytes that will be decoded as ASCII characters.
|
||||||
|
fn ascii_word_and_char_count(line: &[u8]) -> (usize, usize) {
|
||||||
|
let word_count = line.split(|&x| is_word_separator(x)).count();
|
||||||
|
let char_count = line.iter().filter(|c| c.is_ascii()).count();
|
||||||
|
(word_count, char_count)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a [`WordCount`] from a sequence of bytes representing a line.
|
||||||
|
///
|
||||||
|
/// If the last byte of `line` encodes a newline character (`\n`),
|
||||||
|
/// then the [`lines`] field will be set to 1. Otherwise, it will
|
||||||
|
/// be set to 0. The [`bytes`] field is simply the length of
|
||||||
|
/// `line`.
|
||||||
|
///
|
||||||
|
/// If `decode_chars` is `false`, the [`chars`] and [`words`]
|
||||||
|
/// fields will be set to 0. If it is `true`, this function will
|
||||||
|
/// attempt to decode the bytes first as UTF-8, and failing that,
|
||||||
|
/// as ASCII.
|
||||||
|
pub fn from_line(line: &[u8], decode_chars: bool) -> WordCount {
|
||||||
|
// GNU 'wc' only counts lines that end in LF as lines
|
||||||
|
let lines = (*line.last().unwrap() == LF) as usize;
|
||||||
|
let bytes = line.len();
|
||||||
|
let (words, chars) = if decode_chars {
|
||||||
|
WordCount::word_and_char_count(line)
|
||||||
|
} else {
|
||||||
|
(0, 0)
|
||||||
|
};
|
||||||
|
// -L is a GNU 'wc' extension so same behavior on LF
|
||||||
|
let max_line_length = if chars > 0 { chars - lines } else { 0 };
|
||||||
|
WordCount {
|
||||||
|
bytes,
|
||||||
|
chars,
|
||||||
|
lines,
|
||||||
|
words,
|
||||||
|
max_line_length,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Count the UTF-8 characters and words in the given string slice.
|
||||||
|
///
|
||||||
|
/// `s` is a string slice that is assumed to be a UTF-8 string.
|
||||||
|
fn utf8_word_and_char_count(s: &str) -> (usize, usize) {
|
||||||
|
let word_count = s.split_whitespace().count();
|
||||||
|
let char_count = s.chars().count();
|
||||||
|
(word_count, char_count)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn with_title(self, title: &str) -> TitledWordCount {
|
pub fn with_title(self, title: &str) -> TitledWordCount {
|
||||||
TitledWordCount { title, count: self }
|
TitledWordCount { title, count: self }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Count the characters and words in the given slice of bytes.
|
||||||
|
///
|
||||||
|
/// `line` is a slice of bytes that will be decoded as UTF-8
|
||||||
|
/// characters, or if that fails, as ASCII characters.
|
||||||
|
fn word_and_char_count(line: &[u8]) -> (usize, usize) {
|
||||||
|
// try and convert the bytes to UTF-8 first
|
||||||
|
match from_utf8(line) {
|
||||||
|
Ok(s) => WordCount::utf8_word_and_char_count(s),
|
||||||
|
Err(..) => WordCount::ascii_word_and_char_count(line),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This struct supplements the actual word count with a title that is displayed
|
/// This struct supplements the actual word count with a title that is displayed
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue