1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 03:27:44 +00:00

Merge pull request #1495 from orottier/wc

wc: Do not decode UTF-8 when only counting bytes/newlines
This commit is contained in:
Sylvestre Ledru 2020-05-04 10:54:18 +02:00 committed by GitHub
commit 06fe387c76
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -146,6 +146,9 @@ fn wc(files: Vec<String>, settings: &Settings) -> StdResult<(), i32> {
let mut results = vec![]; let mut results = vec![];
let mut max_width: usize = 0; let mut max_width: usize = 0;
// we do not need to decode the byte stream if we're only counting bytes/newlines
let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;
for path in &files { for path in &files {
let mut reader = open(&path[..])?; let mut reader = open(&path[..])?;
@ -173,24 +176,26 @@ fn wc(files: Vec<String>, settings: &Settings) -> StdResult<(), i32> {
byte_count += raw_line.len(); byte_count += raw_line.len();
// try and convert the bytes to UTF-8 first if decode_chars {
let current_char_count; // try and convert the bytes to UTF-8 first
match from_utf8(&raw_line[..]) { let current_char_count;
Ok(line) => { match from_utf8(&raw_line[..]) {
word_count += line.split_whitespace().count(); Ok(line) => {
current_char_count = line.chars().count(); word_count += line.split_whitespace().count();
current_char_count = line.chars().count();
}
Err(..) => {
word_count += raw_line.split(|&x| is_word_seperator(x)).count();
current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count()
}
} }
Err(..) => { char_count += current_char_count;
word_count += raw_line.split(|&x| is_word_seperator(x)).count();
current_char_count = raw_line.iter().filter(|c| c.is_ascii()).count()
}
}
char_count += current_char_count;
if current_char_count > longest_line_length { if current_char_count > longest_line_length {
// we subtract one here because `line.len()` includes the LF // we subtract one here because `line.len()` includes the LF
// matches GNU 'wc' behaviour // matches GNU 'wc' behaviour
longest_line_length = current_char_count - 1; longest_line_length = current_char_count - 1;
}
} }
raw_line.truncate(0); raw_line.truncate(0);