wc: move counting code into WordCount::from_line()

Refactor the counting code from the inner loop of the `wc` program into the `WordCount::from_line()` associated function. This commit also splits that function up into other helper functions that encapsulate decoding characters and finding word boundaries from raw bytes. This commit also implements the `Sum` trait for the `WordCount` struct, so that we can simply call `sum()` on an iterator that yields `WordCount` instances.
2025-09-14 02:57:57 +00:00 · 2021-05-04 22:13:28 -04:00 · 2021-05-04 22:13:28 -04:00 · ba8f4ea670
commit ba8f4ea670
parent 50f4941d49
2 changed files with 95 additions and 62 deletions
--- a/src/uu/wc/src/wc.rs
+++ b/src/uu/wc/src/wc.rs
@ -24,7 +24,6 @@ use std::cmp::max;
 use std::fs::File;
 use std::io::{self, Write};
 use std::path::Path;
-use std::str::from_utf8;

 #[derive(Error, Debug)]
 pub enum WcError {
@ -163,18 +162,6 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
    }
 }

-const CR: u8 = b'\r';
-const LF: u8 = b'\n';
-const SPACE: u8 = b' ';
-const TAB: u8 = b'\t';
-const SYN: u8 = 0x16_u8;
-const FF: u8 = 0x0C_u8;
-
-#[inline(always)]
-fn is_word_separator(byte: u8) -> bool {
-    byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF
-}
-
 fn word_count_from_reader<T: WordCountable>(
    mut reader: T,
    settings: &Settings,
@ -195,58 +182,20 @@ fn word_count_from_reader<T: WordCountable>(
    // we do not need to decode the byte stream if we're only counting bytes/newlines
    let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;

-    let mut line_count: usize = 0;
-    let mut word_count: usize = 0;
-    let mut byte_count: usize = 0;
-    let mut char_count: usize = 0;
-    let mut longest_line_length: usize = 0;
-    let mut ends_lf: bool;
-
-    // reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
-    // hence the option wrapped in a result here
-    for line_result in reader.lines() {
-        let raw_line = match line_result {
-            Ok(l) => l,
+    // Sum the WordCount for each line. Show a warning for each line
+    // that results in an IO error when trying to read it.
+    let total = reader
+        .lines()
+        .filter_map(|res| match res {
+            Ok(line) => Some(line),
            Err(e) => {
                show_warning!("Error while reading {}: {}", path, e);
-                continue;
+                None
            }
-        };
-
-        // GNU 'wc' only counts lines that end in LF as lines
-        ends_lf = *raw_line.last().unwrap() == LF;
-        line_count += ends_lf as usize;
-
-        byte_count += raw_line.len();
-
-        if decode_chars {
-            // try and convert the bytes to UTF-8 first
-            let current_char_count;
-            match from_utf8(&raw_line[..]) {
-                Ok(line) => {
-                    word_count += line.split_whitespace().count();
-                    current_char_count = line.chars().count();
-                }
-                Err(..) => {
-                    word_count += raw_line.split(|&x| is_word_separator(x)).count();
-                    current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count()
-                }
-            }
-            char_count += current_char_count;
-            if current_char_count > longest_line_length {
-                // -L is a GNU 'wc' extension so same behavior on LF
-                longest_line_length = current_char_count - (ends_lf as usize);
-            }
-        }
-    }
-
-    Ok(WordCount {
-        bytes: byte_count,
-        chars: char_count,
-        lines: line_count,
-        words: word_count,
-        max_line_length: longest_line_length,
-    })
+        })
+        .map(|line| WordCount::from_line(&line, decode_chars))
+        .sum();
+    Ok(total)
 }

 fn word_count_from_path(path: &str, settings: &Settings) -> WcResult<WordCount> {
--- a/src/uu/wc/src/wordcount.rs
+++ b/src/uu/wc/src/wordcount.rs
@ -1,5 +1,19 @@
 use std::cmp::max;
+use std::iter::Sum;
 use std::ops::{Add, AddAssign};
+use std::str::from_utf8;
+
+const CR: u8 = b'\r';
+const LF: u8 = b'\n';
+const SPACE: u8 = b' ';
+const TAB: u8 = b'\t';
+const SYN: u8 = 0x16_u8;
+const FF: u8 = 0x0C_u8;
+
+#[inline(always)]
+fn is_word_separator(byte: u8) -> bool {
+    byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF
+}

 #[derive(Debug, Default, Copy, Clone)]
 pub struct WordCount {
@ -30,10 +44,80 @@ impl AddAssign for WordCount {
    }
 }

+impl Sum for WordCount {
+    fn sum<I>(iter: I) -> WordCount
+    where
+        I: Iterator<Item = WordCount>,
+    {
+        iter.fold(WordCount::default(), |acc, x| acc + x)
+    }
+}
+
 impl WordCount {
+    /// Count the characters and the words delimited by `is_word_separator` bytes.
+    ///
+    /// `line` is read as ASCII: bytes outside the ASCII range are not counted as characters.
+    fn ascii_word_and_char_count(line: &[u8]) -> (usize, usize) {
+        let word_count = line.split(|&x| is_word_separator(x)).count();
+        let char_count = line.iter().filter(|c| c.is_ascii()).count();
+        (word_count, char_count)
+    }
+
+    /// Create a [`WordCount`] from a sequence of bytes representing a line.
+    ///
+    /// If the last byte of `line` encodes a newline character (`\n`),
+    /// then the [`lines`] field will be set to 1. Otherwise, it will
+    /// be set to 0. The [`bytes`] field is simply the length of
+    /// `line`.
+    ///
+    /// If `decode_chars` is `false`, the [`chars`], [`words`], and
+    /// [`max_line_length`] fields will be set to 0. If it is `true`,
+    /// this function will attempt to decode the bytes first as UTF-8,
+    /// and failing that, as ASCII.
+    pub fn from_line(line: &[u8], decode_chars: bool) -> WordCount {
+        // GNU 'wc' only counts lines that end in LF as lines
+        let lines = (*line.last().unwrap() == LF) as usize;
+        let bytes = line.len();
+        let (words, chars) = if decode_chars {
+            WordCount::word_and_char_count(line)
+        } else {
+            (0, 0)
+        };
+        // -L is a GNU 'wc' extension, so follow its behavior: the trailing LF is not counted in the line length
+        let max_line_length = if chars > 0 { chars - lines } else { 0 };
+        WordCount {
+            bytes,
+            chars,
+            lines,
+            words,
+            max_line_length,
+        }
+    }
+
+    /// Count the UTF-8 characters and words in the given string slice.
+    ///
+    /// `s` is a string slice that is assumed to be a UTF-8 string.
+    fn utf8_word_and_char_count(s: &str) -> (usize, usize) {
+        let word_count = s.split_whitespace().count();
+        let char_count = s.chars().count();
+        (word_count, char_count)
+    }
+
    pub fn with_title(self, title: &str) -> TitledWordCount {
        TitledWordCount { title, count: self }
    }
+
+    /// Count the characters and words in the given slice of bytes.
+    ///
+    /// `line` is a slice of bytes that will be decoded as UTF-8
+    /// characters, or if that fails, as ASCII characters.
+    fn word_and_char_count(line: &[u8]) -> (usize, usize) {
+        // try and convert the bytes to UTF-8 first
+        match from_utf8(line) {
+            Ok(s) => WordCount::utf8_word_and_char_count(s),
+            Err(..) => WordCount::ascii_word_and_char_count(line),
+        }
+    }
 }

 /// This struct supplements the actual word count with a title that is displayed