From e3bd6b68487a9ffaa4bd322b19bfe7fd903b8906 Mon Sep 17 00:00:00 2001 From: Otto Rottier Date: Sun, 3 May 2020 20:37:54 +0200 Subject: [PATCH] wc: Do not decode UTF8 when only counting bytes/newlines --- src/uu/wc/src/wc.rs | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs index 8c70ff9f4..512f78d74 100644 --- a/src/uu/wc/src/wc.rs +++ b/src/uu/wc/src/wc.rs @@ -146,6 +146,10 @@ fn wc(files: Vec, settings: &Settings) -> StdResult<(), i32> { let mut results = vec![]; let mut max_width: usize = 0; + // we do not need to decode the byte stream if we're only counting bytes/newlines + let decode_chars = settings.show_chars + || settings.show_words || settings.show_max_line_length; + for path in &files { let mut reader = open(&path[..])?; @@ -173,24 +177,26 @@ fn wc(files: Vec, settings: &Settings) -> StdResult<(), i32> { byte_count += raw_line.len(); - // try and convert the bytes to UTF-8 first - let current_char_count; - match from_utf8(&raw_line[..]) { - Ok(line) => { - word_count += line.split_whitespace().count(); - current_char_count = line.chars().count(); + if decode_chars { + // try and convert the bytes to UTF-8 first + let current_char_count; + match from_utf8(&raw_line[..]) { + Ok(line) => { + word_count += line.split_whitespace().count(); + current_char_count = line.chars().count(); + } + Err(..) => { + word_count += raw_line.split(|&x| is_word_seperator(x)).count(); + current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count() + } } - Err(..) => { - word_count += raw_line.split(|&x| is_word_seperator(x)).count(); - current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count() - } - } - char_count += current_char_count; + char_count += current_char_count; - if current_char_count > longest_line_length { - // we subtract one here because `line.len()` includes the LF - // matches GNU 'wc' behaviour - longest_line_length = current_char_count - 1; + if current_char_count > longest_line_length { + // we subtract one here because `line.len()` includes the LF + // matches GNU 'wc' behaviour + longest_line_length = current_char_count - 1; + } } raw_line.truncate(0);